From 7610047be9073cbce59c2f44b09bbc1bfdb46879 Mon Sep 17 00:00:00 2001
From: Marco van den Boom <rob6874@robeco.nl>
Date: Thu, 25 Jan 2024 10:44:34 +0100
Subject: [PATCH 1/2] drop add_traces

---
 atom/plots/baseplot.py                 |  11 +-
 atom/plots/dataplot.py                 | 478 +++++++++------
 atom/plots/hyperparametertuningplot.py | 302 +++++----
 atom/plots/predictionplot.py           | 806 ++++++++++++-------------
 4 files changed, 802 insertions(+), 795 deletions(-)
diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py
index b19a0e58d..faa5fbb72 100644
--- a/atom/plots/baseplot.py
+++ b/atom/plots/baseplot.py
@@ -392,8 +392,8 @@ def _draw_line(
         child: str | None = None,
         legend: Legend | dict[str, Any] | None = None,
         **kwargs,
-    ) -> go.Scatter:
-        """Draw a line.
+    ):
+        """Draw a line on the current figure.
 
         Unify the style to draw a line, where parent and child
         (e.g., model - data set or column - distribution) keep the
@@ -414,13 +414,8 @@ def _draw_line(
         **kwargs
             Additional keyword arguments for the trace.
 
-        Returns
-        -------
-        go.Scatter
-            New trace to add to figure.
-
         """
-        return go.Scatter(
+        BasePlot._fig.figure.add_scatter(
             line=kwargs.pop(
                 "line", {
                     "width": self.line_width,
diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py
index 2b2ce8328..de26e44b5 100644
--- a/atom/plots/dataplot.py
+++ b/atom/plots/dataplot.py
@@ -10,9 +10,10 @@
 from abc import ABCMeta
 from pathlib import Path
 from typing import Any, Literal
-
+from statsmodels.tsa.stattools import pacf
 import numpy as np
 import pandas as pd
+from sklearn.utils.metaestimators import available_if
 import plotly.graph_objects as go
 from beartype import beartype
 from nltk.collocations import (
@@ -29,7 +30,7 @@
     Segment, Sequence, Series,
 )
 from atom.utils.utils import (
-    check_dependency, crash, divide, get_corpus, lst, replace_missing, rnd,
+    check_dependency, crash, divide, get_corpus, lst, replace_missing, rnd, has_task
 )
 
 
@@ -136,22 +137,20 @@ def plot_components(
         color = BasePlot._fig.get_elem("components")
         opacity = [0.2] * self.pca_._comps + [0] * (len(variance) - self.pca_._comps)
 
-        fig.add_trace(
-            go.Bar(
-                x=variance,
-                y=[f"pca{i}" for i in range(len(variance))],
-                orientation="h",
-                marker={
-                    "color": [f"rgba({color[4:-1]}, {o})" for o in opacity],
-                    "line": {"width": 2, "color": color},
-                },
-                hovertemplate="%{x}<extra></extra>",
-                name=f"Variance retained: {variance[:self.pca_._comps].sum():.3f}",
-                legendgroup="components",
-                showlegend=BasePlot._fig.showlegend("components", legend),
-                xaxis=xaxis,
-                yaxis=yaxis,
-            )
+        fig.add_bar(
+            x=variance,
+            y=[f"pca{i}" for i in range(len(variance))],
+            orientation="h",
+            marker={
+                "color": [f"rgba({color[4:-1]}, {o})" for o in opacity],
+                "line": {"width": 2, "color": color},
+            },
+            hovertemplate="%{x}<extra></extra>",
+            name=f"Variance retained: {variance[:self.pca_._comps].sum():.3f}",
+            legendgroup="components",
+            showlegend=BasePlot._fig.showlegend("components", legend),
+            xaxis=xaxis,
+            yaxis=yaxis,
         )
 
         fig.update_layout({f"yaxis{yaxis[1:]}": {"categoryorder": "total ascending"}})
@@ -264,18 +263,16 @@ def plot_correlation(
             },
         )
 
-        fig.add_trace(
-            go.Heatmap(
-                z=corr.mask(mask),
-                x=columns_c,
-                y=columns_c,
-                coloraxis=f"coloraxis{xaxis[1:]}",
-                hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}<extra></extra>",
-                hoverongaps=False,
-                showlegend=False,
-                xaxis=xaxis,
-                yaxis=yaxis,
-            )
+        fig.add_heatmap(
+            z=corr.mask(mask),
+            x=columns_c,
+            y=columns_c,
+            coloraxis=f"coloraxis{xaxis[1:]}",
+            hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}<extra></extra>",
+            hoverongaps=False,
+            showlegend=False,
+            xaxis=xaxis,
+            yaxis=yaxis,
         )
 
         fig.update_layout(
@@ -412,21 +409,19 @@ def plot_distribution(
             show_c = self._get_show(show, len(series))
 
             color = BasePlot._fig.get_elem()
-            fig.add_trace(
-                go.Bar(
-                    x=series,
-                    y=series.index,
-                    orientation="h",
-                    marker={
-                        "color": f"rgba({color[4:-1]}, 0.2)",
-                        "line": {"width": 2, "color": color},
-                    },
-                    hovertemplate="%{x}<extra></extra>",
-                    name=f"{columns_c[0]}: {len(series)} classes",
-                    showlegend=BasePlot._fig.showlegend("dist", legend),
-                    xaxis=xaxis,
-                    yaxis=yaxis,
-                )
+            fig.add_bar(
+                x=series,
+                y=series.index,
+                orientation="h",
+                marker={
+                    "color": f"rgba({color[4:-1]}, 0.2)",
+                    "line": {"width": 2, "color": color},
+                },
+                hovertemplate="%{x}<extra></extra>",
+                name=f"{columns_c[0]}: {len(series)} classes",
+                showlegend=BasePlot._fig.showlegend("dist", legend),
+                xaxis=xaxis,
+                yaxis=yaxis,
             )
 
             return self._plot(
@@ -443,22 +438,20 @@ def plot_distribution(
 
         else:
             for col in [c for c in columns_c if c in num_columns]:
-                fig.add_trace(
-                    go.Histogram(
-                        x=self.branch.dataset[col],
-                        histnorm="probability density",
-                        marker={
-                            "color": f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)",
-                            "line": {"width": 2, "color": BasePlot._fig.get_elem(col)},
-                        },
-                        nbinsx=40,
-                        name="dist",
-                        legendgroup=col,
-                        legendgrouptitle={"text": col, "font_size": self.label_fontsize},
-                        showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend),
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_histogram(
+                    x=self.branch.dataset[col],
+                    histnorm="probability density",
+                    marker={
+                        "color": f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)",
+                        "line": {"width": 2, "color": BasePlot._fig.get_elem(col)},
+                    },
+                    nbinsx=40,
+                    name="dist",
+                    legendgroup=col,
+                    legendgrouptitle={"text": col, "font_size": self.label_fontsize},
+                    showlegend=BasePlot._fig.showlegend(f"{col}-dist", legend),
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
                 x = np.linspace(
@@ -480,16 +473,14 @@ def plot_distribution(
                             params = getattr(stats, dist).fit(values)
                             y = getattr(stats, dist).pdf(x, *params)
 
-                        fig.add_trace(
-                            self._draw_line(
-                                x=x,
-                                y=y,
-                                parent=col,
-                                child=dist,
-                                legend=legend,
-                                xaxis=xaxis,
-                                yaxis=yaxis,
-                            )
+                        self._draw_line(
+                            x=x,
+                            y=y,
+                            parent=col,
+                            child=dist,
+                            legend=legend,
+                            xaxis=xaxis,
+                            yaxis=yaxis,
                         )
 
             fig.update_layout({"barmode": "overlay"})
@@ -650,22 +641,20 @@ def get_text(column: Series) -> Series:
         fig = self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
-        fig.add_trace(
-            go.Bar(
-                x=(data := series[-self._get_show(show, len(series)):]),
-                y=data.index,
-                orientation="h",
-                marker={
-                    "color": f"rgba({BasePlot._fig.get_elem(ngram_c)[4:-1]}, 0.2)",
-                    "line": {"width": 2, "color": BasePlot._fig.get_elem(ngram_c)},
-                },
-                hovertemplate="%{x}<extra></extra>",
-                name=f"Total {ngram_c}: {len(series)}",
-                legendgroup=ngram_c,
-                showlegend=BasePlot._fig.showlegend(ngram_c, legend),
-                xaxis=xaxis,
-                yaxis=yaxis,
-            )
+        fig.add_bar(
+            x=(data := series[-self._get_show(show, len(series)):]),
+            y=data.index,
+            orientation="h",
+            marker={
+                "color": f"rgba({BasePlot._fig.get_elem(ngram_c)[4:-1]}, 0.2)",
+                "line": {"width": 2, "color": BasePlot._fig.get_elem(ngram_c)},
+            },
+            hovertemplate="%{x}<extra></extra>",
+            name=f"Total {ngram_c}: {len(series)}",
+            legendgroup=ngram_c,
+            showlegend=BasePlot._fig.showlegend(ngram_c, legend),
+            xaxis=xaxis,
+            yaxis=yaxis,
         )
 
         return self._plot(
@@ -679,6 +668,131 @@ def get_text(column: Series) -> Series:
             display=display,
         )
 
+    @available_if(has_task("forecast"))
+    @crash
+    def plot_pacf(
+        self,
+        columns: ColumnSelector | None = None,
+        show: IntLargerZero | None = 10,
+        *,
+        title: str | dict[str, Any] | None = None,
+        legend: Legend | dict[str, Any] | None = "lower right",
+        figsize: tuple[IntLargerZero, IntLargerZero] | None = None,
+        filename: str | Path | None = None,
+        display: Bool | None = True,
+    ) -> go.Figure | None:
+        """Plot the partial autocorrelation function.
+
+        Shows the 95% confidence band. Missing values are ignored.
+
+        !!! tip
+            Use atom's [decompose][atomforecaster-decompose] method to
+            remove trend and seasonality from the data.
+
+        Parameters
+        ----------
+        columns: int, str, segment, sequence, dataframe or None, default=None
+            Columns to plot the pacf from. If None, it selects the
+            target column.
+
+        show: int or None, default=10
+            Number of lags to show in the plot. Per column, it's capped
+            at the highest lag computable from the non-missing values.
+
+        title: str, dict or None, default=None
+            Title for the plot.
+
+            - If None, no title is shown.
+            - If str, text for the title.
+            - If dict, [title configuration][parameters].
+
+        legend: str, dict or None, default="lower right"
+            Legend for the plot. See the [user guide][parameters] for
+            an extended description of the choices.
+
+            - If None: No legend is shown.
+            - If str: Location where to show the legend.
+            - If dict: Legend configuration.
+
+        figsize: tuple or None, default=None
+            Figure's size in pixels, format as (x, y). If None, it
+            defaults to (900, 600).
+
+        filename: str, Path or None, default=None
+            Save the plot using this name. Use "auto" for automatic
+            naming. The type of the file depends on the provided name
+            (.html, .png, .pdf, etc...). If `filename` has no file type,
+            the plot is saved as html. If None, the plot is not saved.
+
+        display: bool or None, default=True
+            Whether to render the plot. If None, it returns the figure.
+
+        Returns
+        -------
+        [go.Figure][] or None
+            Plot object. Only returned if `display=None`.
+
+        See Also
+        --------
+        atom.plots:DataPlot.plot_acf
+        atom.plots:DataPlot.plot_decomposition
+        atom.plots:DataPlot.plot_ttf
+
+        Examples
+        --------
+        ```pycon
+        from atom import ATOMForecaster
+        from sktime.datasets import load_airline
+
+        y = load_airline()
+
+        atom = ATOMForecaster(y, random_state=1)
+        atom.plot_pacf()
+        ```
+
+        """
+        if columns is None:
+            columns_c = lst(self.branch.target)
+        else:
+            columns_c = self.branch._get_columns(columns)
+        show_c = self._get_show(show)
+
+        fig = self._get_figure()
+        xaxis, yaxis = BasePlot._fig.get_axes()
+
+        # Set per trace so other plots on the same canvas stay untouched.
+        kw = {"showlegend": False, "xaxis": xaxis, "yaxis": yaxis}
+        stem = {"mode": "lines", "line_color": "#3f3f3f"}
+        dots = {"mode": "markers", "marker_color": "#1f77b4", "marker_size": 12}
+        band = {"mode": "lines", "line_color": "rgba(255,255,255,0)"}
+        fill = {"fill": "tonexty", "fillcolor": "rgba(32, 146, 230,0.3)"}
+
+        for col in columns_c:
+            series = self.branch.dataset[col].dropna()
+            # statsmodels can't compute lags beyond half the sample size.
+            nlags = max(1, min(show_c, len(series) // 2 - 1))
+            corr, conf = pacf(series, nlags=nlags, alpha=0.05)
+
+            for lag, value in enumerate(corr):
+                fig.add_scatter(x=(lag, lag), y=(0, value), **stem, **kw)
+            fig.add_scatter(x=(lags := np.arange(len(corr))), y=corr, **dots, **kw)
+            # Confidence band around zero: the lower bound fills up to the upper.
+            fig.add_scatter(x=lags, y=conf[:, 1] - corr, **band, **kw)
+            fig.add_scatter(x=lags, y=conf[:, 0] - corr, **fill, **band, **kw)
+
+        fig.update_layout({f"yaxis{yaxis[1:]}": {"zerolinecolor": "black"}})
+
+        return self._plot(
+            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+            xlabel="Lag",
+            title=title,
+            legend=legend,
+            figsize=figsize or (900, 600),
+            plotname="plot_pacf",
+            filename=filename,
+            display=display,
+        )
+
     @crash
     def plot_pca(
         self,
@@ -761,23 +875,21 @@ def plot_pca(
 
         fig = self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
-        fig.add_trace(
-            go.Scatter(
-                x=tuple(range(1, self.pca_.n_features_in_ + 1)),
-                y=np.cumsum(self.pca_.explained_variance_ratio_),
-                mode="lines+markers",
-                line={"width": self.line_width, "color": BasePlot._fig.get_elem("pca")},
-                marker={
-                    "symbol": symbols,
-                    "size": sizes,
-                    "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
-                    "opacity": 1,
-                },
-                hovertemplate="%{y}<extra></extra>",
-                showlegend=False,
-                xaxis=xaxis,
-                yaxis=yaxis,
-            )
+        fig.add_scatter(
+            x=tuple(range(1, self.pca_.n_features_in_ + 1)),
+            y=np.cumsum(self.pca_.explained_variance_ratio_),
+            mode="lines+markers",
+            line={"width": self.line_width, "color": BasePlot._fig.get_elem("pca")},
+            marker={
+                "symbol": symbols,
+                "size": sizes,
+                "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+                "opacity": 1,
+            },
+            hovertemplate="%{y}<extra></extra>",
+            showlegend=False,
+            xaxis=xaxis,
+            yaxis=yaxis,
         )
 
         fig.update_layout(
@@ -821,7 +933,7 @@ def plot_qq(
 
         Parameters
         ----------
-        columns: int, str, slice or sequence, default=0
+        columns: int, str, segment, sequence or dataframe, default=0
             Columns to plot. Selected categorical columns are ignored.
 
         distributions: str or sequence, default="norm"
@@ -896,17 +1008,15 @@ def plot_qq(
                 params = stat.fit(values)
                 samples = stat.rvs(*params, size=101, random_state=self.random_state)
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=(x := np.percentile(samples, percentiles)),
-                        y=(y := np.percentile(values, percentiles)),
-                        mode="markers",
-                        parent=col,
-                        child=dist,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=(x := np.percentile(samples, percentiles)),
+                    y=(y := np.percentile(values, percentiles)),
+                    mode="markers",
+                    parent=col,
+                    child=dist,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         self._draw_straight_line((x, y), y="diagonal", xaxis=xaxis, yaxis=yaxis)
@@ -1027,43 +1137,37 @@ def plot_relationships(
             )
 
             if x == y:
-                fig.add_trace(
-                    go.Histogram(
-                        x=self.branch.dataset[columns_c[x]],
-                        marker={
-                            "color": f"rgba({color[4:-1]}, 0.2)",
-                            "line": {"width": 2, "color": color},
-                        },
-                        name=columns_c[x],
-                        showlegend=False,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_histogram(
+                    x=self.branch.dataset[columns_c[x]],
+                    marker={
+                        "color": f"rgba({color[4:-1]}, 0.2)",
+                        "line": {"width": 2, "color": color},
+                    },
+                    name=columns_c[x],
+                    showlegend=False,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
             elif x > y:
-                fig.add_trace(
-                    go.Scatter(
-                        x=sample(columns_c[y]),
-                        y=sample(columns_c[x]),
-                        mode="markers",
-                        marker={"color": color},
-                        hovertemplate="(%{x}, %{y})<extra></extra>",
-                        showlegend=False,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_scatter(
+                    x=sample(columns_c[y]),
+                    y=sample(columns_c[x]),
+                    mode="markers",
+                    marker={"color": color},
+                    hovertemplate="(%{x}, %{y})<extra></extra>",
+                    showlegend=False,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
             elif y > x:
-                fig.add_trace(
-                    go.Histogram2dContour(
-                        x=self.branch.dataset[columns_c[y]],
-                        y=self.branch.dataset[columns_c[x]],
-                        coloraxis=f"coloraxis{xaxis[1:]}",
-                        hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}<extra></extra>",
-                        showlegend=False,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_histogram2dcontour(
+                    x=self.branch.dataset[columns_c[y]],
+                    y=self.branch.dataset[columns_c[x]],
+                    coloraxis=f"coloraxis{xaxis[1:]}",
+                    hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}<extra></extra>",
+                    showlegend=False,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
             if x < len(columns_c) - 1:
@@ -1181,24 +1285,22 @@ def plot_rfecv(
         mean = self.rfecv_.cv_results_["mean_test_score"]
         std = self.rfecv_.cv_results_["std_test_score"]
 
-        fig.add_trace(
-            go.Scatter(
-                x=list(x),
-                y=mean,
-                mode="lines+markers",
-                line={"width": self.line_width, "color": BasePlot._fig.get_elem("rfecv")},
-                marker={
-                    "symbol": symbols,
-                    "size": sizes,
-                    "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
-                    "opacity": 1,
-                },
-                name=ylabel,
-                legendgroup="rfecv",
-                showlegend=BasePlot._fig.showlegend("rfecv", legend),
-                xaxis=xaxis,
-                yaxis=yaxis,
-            )
+        fig.add_scatter(
+            x=list(x),
+            y=mean,
+            mode="lines+markers",
+            line={"width": self.line_width, "color": BasePlot._fig.get_elem("rfecv")},
+            marker={
+                "symbol": symbols,
+                "size": sizes,
+                "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+                "opacity": 1,
+            },
+            name=ylabel,
+            legendgroup="rfecv",
+            showlegend=BasePlot._fig.showlegend("rfecv", legend),
+            xaxis=xaxis,
+            yaxis=yaxis,
         )
 
         # Add error bands
@@ -1340,22 +1442,20 @@ def plot_series(
 
         for col in columns_c:
             for child, ds in self._get_set(rows):
-                fig.add_trace(
-                    self._draw_line(
-                        x=self._get_plot_index(y := self.branch._get_rows(ds)[col]),
-                        y=y,
-                        mode="lines+markers",
-                        marker={
-                            "size": self.marker_size,
-                            "color": BasePlot._fig.get_elem(col),
-                            "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
-                        },
-                        parent=col,
-                        child=child,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=self._get_plot_index(y := self.branch._get_rows(ds)[col]),
+                    y=y,
+                    mode="lines+markers",
+                    marker={
+                        "size": self.marker_size,
+                        "color": BasePlot._fig.get_elem(col),
+                        "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+                    },
+                    parent=col,
+                    child=child,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         return self._plot(
@@ -1476,13 +1576,11 @@ def get_text(column):
         fig = self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
-        fig.add_trace(
-            go.Image(
-                z=wordcloud.generate(get_text(rows_c[corpus])),
-                hoverinfo="skip",
-                xaxis=xaxis,
-                yaxis=yaxis,
-            )
+        fig.add_image(
+            z=wordcloud.generate(get_text(rows_c[corpus])),
+            hoverinfo="skip",
+            xaxis=xaxis,
+            yaxis=yaxis,
         )
 
         fig.update_layout(
diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py
index dd799aab2..e520a6e03 100644
--- a/atom/plots/hyperparametertuningplot.py
+++ b/atom/plots/hyperparametertuningplot.py
@@ -254,16 +254,14 @@ def plot_edf(
         for m in models_c:
             for met in metric_c:
                 y = np.sum(m.trials[met].values[:, np.newaxis] <= x, axis=0)
-                fig.add_trace(
-                    self._draw_line(
-                        x=x,
-                        y=y / len(m.trials),
-                        parent=m.name,
-                        child=met,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=x,
+                    y=y / len(m.trials),
+                    parent=m.name,
+                    child=met,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         BasePlot._fig.used_models.extend(models_c)
@@ -379,22 +377,20 @@ def plot_hyperparameter_importance(
             fanova = FanovaImportanceEvaluator(seed=self.random_state)
             importances = fanova.evaluate(m.study, target=self._optuna_target(metric_c))
 
-            fig.add_trace(
-                go.Bar(
-                    x=np.array(list(importances.values())) / sum(importances.values()),
-                    y=list(importances),
-                    orientation="h",
-                    marker={
-                        "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
-                        "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
-                    },
-                    hovertemplate="%{x}<extra></extra>",
-                    name=m.name,
-                    legendgroup=m.name,
-                    showlegend=BasePlot._fig.showlegend(m.name, legend),
-                    xaxis=xaxis,
-                    yaxis=yaxis,
-                )
+            fig.add_bar(
+                x=np.array(list(importances.values())) / sum(importances.values()),
+                y=list(importances),
+                orientation="h",
+                marker={
+                    "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+                    "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+                },
+                hovertemplate="%{x}<extra></extra>",
+                name=m.name,
+                legendgroup=m.name,
+                showlegend=BasePlot._fig.showlegend(m.name, legend),
+                xaxis=xaxis,
+                yaxis=yaxis,
             )
 
         fig.update_layout(
@@ -534,46 +530,42 @@ def plot_hyperparameters(
                     },
                 )
 
-                fig.add_trace(
-                    go.Scatter(
-                        x=model.trials[params_c[y]],
-                        y=model.trials[params_c[x + 1]],
-                        mode="markers",
-                        marker={
-                            "size": self.marker_size,
-                            "color": BasePlot._fig.get_elem(model.name),
-                            "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
-                        },
-                        customdata=list(
-                            zip(model.trials.index, model.trials[metric_c], strict=True)
-                        ),
-                        hovertemplate=(
-                            f"{params_c[y]}:%{{x}}<br>"
-                            f"{params_c[x + 1]}:%{{y}}<br>"
-                            f"{metric_c}:%{{customdata[1]:.4f}}"
-                            "<extra>Trial %{customdata[0]}</extra>"
-                        ),
-                        showlegend=False,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_scatter(
+                    x=model.trials[params_c[y]],
+                    y=model.trials[params_c[x + 1]],
+                    mode="markers",
+                    marker={
+                        "size": self.marker_size,
+                        "color": BasePlot._fig.get_elem(model.name),
+                        "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+                    },
+                    customdata=list(
+                        zip(model.trials.index, model.trials[metric_c], strict=True)
+                    ),
+                    hovertemplate=(
+                        f"{params_c[y]}:%{{x}}<br>"
+                        f"{params_c[x + 1]}:%{{y}}<br>"
+                        f"{metric_c}:%{{customdata[1]:.4f}}"
+                        "<extra>Trial %{customdata[0]}</extra>"
+                    ),
+                    showlegend=False,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
-                fig.add_trace(
-                    go.Contour(
-                        x=model.trials[params_c[y]],
-                        y=model.trials[params_c[x + 1]],
-                        z=model.trials[metric_c],
-                        contours={
-                            "showlabels": True,
-                            "labelfont": {"size": self.tick_fontsize, "color": "white"},
-                        },
-                        coloraxis="coloraxis99",
-                        hoverinfo="skip",
-                        showlegend=False,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_contour(
+                    x=model.trials[params_c[y]],
+                    y=model.trials[params_c[x + 1]],
+                    z=model.trials[metric_c],
+                    contours={
+                        "showlabels": True,
+                        "labelfont": {"size": self.tick_fontsize, "color": "white"},
+                    },
+                    coloraxis="coloraxis99",
+                    hoverinfo="skip",
+                    showlegend=False,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
                 if _is_log_scale(model.study.trials, params_c[y]):
@@ -762,17 +754,15 @@ def sort_mixed_types(values: list[str]) -> list[str]:
             }
         )
 
-        fig.add_trace(
-            go.Parcoords(
-                dimensions=dims,
-                line={
-                    "color": dims[0]["values"],
-                    "coloraxis": f"coloraxis{xaxis[1:]}",
-                },
-                unselected={"line": {"color": "gray", "opacity": 0.5}},
-                labelside="bottom",
-                labelfont={"size": self.label_fontsize},
-            )
+        fig.add_parcoords(
+            dimensions=dims,
+            line={
+                "color": dims[0]["values"],
+                "coloraxis": f"coloraxis{xaxis[1:]}",
+            },
+            unselected={"line": {"color": "gray", "opacity": 0.5}},
+            labelside="bottom",
+            labelfont={"size": self.label_fontsize},
         )
 
         BasePlot._fig.used_models.append(model)
@@ -904,22 +894,20 @@ def plot_pareto_front(
                     y=(y_pos, rnd(y_pos + size)),
                 )
 
-                fig.add_trace(
-                    go.Scatter(
-                        x=model.trials[metric_c[y]],
-                        y=model.trials[metric_c[x + 1]],
-                        mode="markers",
-                        marker={
-                            "size": self.marker_size,
-                            "color": model.trials.index,
-                            "colorscale": "Teal",
-                            "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
-                        },
-                        customdata=model.trials.index,
-                        hovertemplate="(%{x}, %{y})<extra>Trial %{customdata}</extra>",
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_scatter(
+                    x=model.trials[metric_c[y]],
+                    y=model.trials[metric_c[x + 1]],
+                    mode="markers",
+                    marker={
+                        "size": self.marker_size,
+                        "color": model.trials.index,
+                        "colorscale": "Teal",
+                        "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+                    },
+                    customdata=model.trials.index,
+                    hovertemplate="(%{x}, %{y})<extra>Trial %{customdata}</extra>",
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
                 if x < length - 1:
@@ -1057,22 +1045,20 @@ def plot_slice(
                 y=(y_pos, rnd(y_pos + y_size)),
             )
 
-            fig.add_trace(
-                go.Scatter(
-                    x=model.trials[params_c[y]],
-                    y=model.trials[metric_c[x]],
-                    mode="markers",
-                    marker={
-                        "size": self.marker_size,
-                        "color": model.trials.index,
-                        "colorscale": "Teal",
-                        "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
-                    },
-                    customdata=model.trials.index,
-                    hovertemplate="(%{x}, %{y})<extra>Trial %{customdata}</extra>",
-                    xaxis=xaxis,
-                    yaxis=yaxis,
-                )
+            fig.add_scatter(
+                x=model.trials[params_c[y]],
+                y=model.trials[metric_c[x]],
+                mode="markers",
+                marker={
+                    "size": self.marker_size,
+                    "color": model.trials.index,
+                    "colorscale": "Teal",
+                    "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+                },
+                customdata=model.trials.index,
+                hovertemplate="(%{x}, %{y})<extra>Trial %{customdata}</extra>",
+                xaxis=xaxis,
+                yaxis=yaxis,
             )
 
             if _is_log_scale(model.study.trials, params_c[y]):
@@ -1205,17 +1191,15 @@ def plot_terminator_improvement(
                     "(e.g., using ht_params={'cv': 5}) on a single-metric optimization."
                 )
 
-            fig.add_trace(
-                self._draw_line(
-                    x=m.trials.index,
-                    y=info.improvements,
-                    error_y={"type": "data", "array": info.errors},
-                    mode="markers+lines",
-                    parent=m.name,
-                    legend=legend,
-                    xaxis=xaxis,
-                    yaxis=yaxis,
-                )
+            self._draw_line(
+                x=m.trials.index,
+                y=info.improvements,
+                error_y={"type": "data", "array": info.errors},
+                mode="markers+lines",
+                parent=m.name,
+                legend=legend,
+                xaxis=xaxis,
+                yaxis=yaxis,
             )
 
         BasePlot._fig.used_models.extend(models_c)
@@ -1350,24 +1334,22 @@ def plot_timeline(
 
             for state in sorted(TrialState, key=lambda x: x.name):
                 if bars := list(filter(lambda x: x.state == state, info)):
-                    fig.add_trace(
-                        go.Bar(
-                            name=state.name,
-                            x=[b.duration for b in bars],
-                            y=[b.number for b in bars],
-                            base=[b.start.isoformat() for b in bars],
-                            text=[b.hovertext for b in bars],
-                            textposition="none",
-                            hovertemplate=f"%{{text}}<extra>{m.name}</extra>",
-                            orientation="h",
-                            marker={
-                                "color": f"rgba({_cm[state.name][4:-1]}, 0.2)",
-                                "line": {"width": 2, "color": _cm[state.name]},
-                            },
-                            showlegend=BasePlot._fig.showlegend(_cm[state.name], legend),
-                            xaxis=xaxis,
-                            yaxis=yaxis,
-                        )
+                    fig.add_bar(
+                        name=state.name,
+                        x=[b.duration for b in bars],
+                        y=[b.number for b in bars],
+                        base=[b.start.isoformat() for b in bars],
+                        text=[b.hovertext for b in bars],
+                        textposition="none",
+                        hovertemplate=f"%{{text}}<extra>{m.name}</extra>",
+                        orientation="h",
+                        marker={
+                            "color": f"rgba({_cm[state.name][4:-1]}, 0.2)",
+                            "line": {"width": 2, "color": _cm[state.name]},
+                        },
+                        showlegend=BasePlot._fig.showlegend(_cm[state.name], legend),
+                        xaxis=xaxis,
+                        yaxis=yaxis,
                     )
 
         fig.update_layout({f"xaxis{yaxis[1:]}_type": "date", "barmode": "group"})
@@ -1485,34 +1467,30 @@ def plot_trials(
                 sizes = [self.marker_size] * len(m.trials)
                 sizes[m.best_trial.number] = self.marker_size * 1.5
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=m.trials.index,
-                        y=m.trials[met],
-                        mode="lines+markers",
-                        marker_symbol=symbols,
-                        marker_size=sizes,
-                        hovertemplate=None,
-                        parent=m.name,
-                        child=self._metric[met].name,
-                        legend=legend,
-                        xaxis=xaxis2,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=m.trials.index,
+                    y=m.trials[met],
+                    mode="lines+markers",
+                    marker_symbol=symbols,
+                    marker_size=sizes,
+                    hovertemplate=None,
+                    parent=m.name,
+                    child=self._metric[met].name,
+                    legend=legend,
+                    xaxis=xaxis2,
+                    yaxis=yaxis,
                 )
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=m.trials.index,
-                        y=m.trials[met].diff(),
-                        mode="lines+markers",
-                        marker_symbol="circle",
-                        parent=m.name,
-                        child=self._metric[met].name,
-                        legend=legend,
-                        xaxis=xaxis2,
-                        yaxis=yaxis2,
-                    )
+                self._draw_line(
+                    x=m.trials.index,
+                    y=m.trials[met].diff(),
+                    mode="lines+markers",
+                    marker_symbol="circle",
+                    parent=m.name,
+                    child=self._metric[met].name,
+                    legend=legend,
+                    xaxis=xaxis2,
+                    yaxis=yaxis2,
                 )
 
         fig.update_layout(
diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py
index f32954740..69f5fcd25 100644
--- a/atom/plots/predictionplot.py
+++ b/atom/plots/predictionplot.py
@@ -165,44 +165,40 @@ def plot_bootstrap(
 
         for met in metric_c:
             if any(m._bootstrap is None for m in models_c):
-                fig.add_trace(
-                    go.Bar(
-                        x=[m._best_score(met) for m in models_c],
-                        y=[m.name for m in models_c],
-                        error_x={
-                            "type": "data",
-                            "array": [
-                                0 if m._bootstrap is None else m.bootstrap.loc[:, met].std()
-                                for m in models_c
-                            ],
-                        },
-                        orientation="h",
-                        marker={
-                            "color": f"rgba({BasePlot._fig.get_elem(met)[4:-1]}, 0.2)",
-                            "line": {"width": 2, "color": BasePlot._fig.get_elem(met)},
-                        },
-                        hovertemplate="%{x}<extra></extra>",
-                        name=met,
-                        legendgroup=met,
-                        showlegend=BasePlot._fig.showlegend(met, legend),
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_bar(
+                    x=[m._best_score(met) for m in models_c],
+                    y=[m.name for m in models_c],
+                    error_x={
+                        "type": "data",
+                        "array": [
+                            0 if m._bootstrap is None else m.bootstrap.loc[:, met].std()
+                            for m in models_c
+                        ],
+                    },
+                    orientation="h",
+                    marker={
+                        "color": f"rgba({BasePlot._fig.get_elem(met)[4:-1]}, 0.2)",
+                        "line": {"width": 2, "color": BasePlot._fig.get_elem(met)},
+                    },
+                    hovertemplate="%{x}<extra></extra>",
+                    name=met,
+                    legendgroup=met,
+                    showlegend=BasePlot._fig.showlegend(met, legend),
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
             else:
-                fig.add_trace(
-                    go.Box(
-                        x=np.ravel([m.bootstrap.loc[:, met] for m in models_c]),
-                        y=np.ravel([[m.name] * len(m.bootstrap) for m in models_c]),
-                        marker_color=BasePlot._fig.get_elem(met),
-                        boxpoints="outliers",
-                        orientation="h",
-                        name=met,
-                        legendgroup=met,
-                        showlegend=BasePlot._fig.showlegend(met, legend),
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_box(
+                    x=np.ravel([m.bootstrap.loc[:, met] for m in models_c]),
+                    y=np.ravel([[m.name] * len(m.bootstrap) for m in models_c]),
+                    marker_color=BasePlot._fig.get_elem(met),
+                    boxpoints="outliers",
+                    orientation="h",
+                    name=met,
+                    legendgroup=met,
+                    showlegend=BasePlot._fig.showlegend(met, legend),
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         fig.update_layout(
@@ -347,34 +343,30 @@ def plot_calibration(
                 # Get calibration (frac of positives and predicted values)
                 frac_pos, pred = calibration_curve(y_true, y_pred, n_bins=n_bins)
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=pred,
-                        y=frac_pos,
-                        parent=m.name,
-                        child=child,
-                        mode="lines+markers",
-                        marker_symbol="circle",
-                        legend=legend,
-                        xaxis=xaxis2,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=pred,
+                    y=frac_pos,
+                    parent=m.name,
+                    child=child,
+                    mode="lines+markers",
+                    marker_symbol="circle",
+                    legend=legend,
+                    xaxis=xaxis2,
+                    yaxis=yaxis,
                 )
 
-                fig.add_trace(
-                    go.Histogram(
-                        x=y_pred,
-                        xbins={"start": 0, "end": 1, "size": 1.0 / n_bins},
-                        marker={
-                            "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
-                            "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
-                        },
-                        name=m.name,
-                        legendgroup=m.name,
-                        showlegend=False,
-                        xaxis=xaxis2,
-                        yaxis=yaxis2,
-                    )
+                fig.add_histogram(
+                    x=y_pred,
+                    xbins={"start": 0, "end": 1, "size": 1.0 / n_bins},
+                    marker={
+                        "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+                        "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+                    },
+                    name=m.name,
+                    legendgroup=m.name,
+                    showlegend=False,
+                    xaxis=xaxis2,
+                    yaxis=yaxis2,
                 )
 
         self._draw_straight_line((pred, frac_pos), y="diagonal", xaxis=xaxis2, yaxis=yaxis)
@@ -544,27 +536,25 @@ def plot_confusion_matrix(
                     target_c, np.unique(m.branch.dataset[target_c]).astype(str)
                 )
 
-                fig.add_trace(
-                    go.Heatmap(
-                        x=ticks,
-                        y=ticks,
-                        z=100.0 * cm / cm.sum(axis=1)[:, np.newaxis],
-                        coloraxis=f"coloraxis{xaxis[1:]}",
-                        text=cm,
-                        customdata=labels,
-                        texttemplate="%{text}<br>(%{z:.2f}%)",
-                        textfont={"size": self.label_fontsize},
-                        hovertemplate=(
-                            "%{customdata}<extra></extra>"
-                            if self.task.is_binary
-                            else ""
-                            "Predicted label:%{x}<br>True label:%{y}<br>Percentage:%{z}"
-                            "<extra></extra>"
-                        ),
-                        showlegend=False,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_heatmap(
+                    x=ticks,
+                    y=ticks,
+                    z=100.0 * cm / cm.sum(axis=1)[:, np.newaxis],
+                    coloraxis=f"coloraxis{xaxis[1:]}",
+                    text=cm,
+                    customdata=labels,
+                    texttemplate="%{text}<br>(%{z:.2f}%)",
+                    textfont={"size": self.label_fontsize},
+                    hovertemplate=(
+                        "%{customdata}<extra></extra>"
+                        if self.task.is_binary
+                        else ""
+                        "Predicted label:%{x}<br>True label:%{y}<br>Percentage:%{z}"
+                        "<extra></extra>"
+                    ),
+                    showlegend=False,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
                 fig.update_layout(
@@ -577,22 +567,20 @@ def plot_confusion_matrix(
                 )
 
             else:
-                fig.add_trace(
-                    go.Bar(
-                        x=cm.ravel(),
-                        y=labels.ravel(),
-                        orientation="h",
-                        marker={
-                            "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
-                            "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
-                        },
-                        hovertemplate="%{x}<extra></extra>",
-                        name=m.name,
-                        legendgroup=m.name,
-                        showlegend=BasePlot._fig.showlegend(m.name, legend),
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_bar(
+                    x=cm.ravel(),
+                    y=labels.ravel(),
+                    orientation="h",
+                    marker={
+                        "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+                        "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+                    },
+                    hovertemplate="%{x}<extra></extra>",
+                    name=m.name,
+                    legendgroup=m.name,
+                    showlegend=BasePlot._fig.showlegend(m.name, legend),
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
                 fig.update_layout(bargroupgap=0.05)
@@ -709,17 +697,15 @@ def plot_det(
                     *m._get_pred(ds, target, method=("decision_function", "predict_proba"))
                 )
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=fpr,
-                        y=fnr,
-                        mode="lines",
-                        parent=m.name,
-                        child=child,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=fpr,
+                    y=fnr,
+                    mode="lines",
+                    parent=m.name,
+                    child=child,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         BasePlot._fig.used_models.extend(models_c)
@@ -833,17 +819,15 @@ def plot_errors(
             for child, ds in self._get_set(rows):
                 y_true, y_pred = m._get_pred(ds, target)
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=y_true,
-                        y=y_pred,
-                        mode="markers",
-                        parent=m.name,
-                        child=child,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=y_true,
+                    y=y_pred,
+                    mode="markers",
+                    parent=m.name,
+                    child=child,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
                 # Fit the points using linear regression
@@ -852,17 +836,15 @@ def plot_errors(
                 model = OrdinaryLeastSquares(goal=self._goal)
                 estimator = model._get_est({}).fit(bk.DataFrame(y_true), y_pred)
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=(x := np.linspace(y_true.min(), y_true.max(), 100)),
-                        y=estimator.predict(x[:, np.newaxis]),
-                        mode="lines",
-                        hovertemplate="(%{x}, %{y})<extra></extra>",
-                        parent=m.name,
-                        legend=None,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=(x := np.linspace(y_true.min(), y_true.max(), 100)),
+                    y=estimator.predict(x[:, np.newaxis]),
+                    mode="lines",
+                    hovertemplate="(%{x}, %{y})<extra></extra>",
+                    parent=m.name,
+                    legend=None,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         self._draw_straight_line((y_true, y_pred), y="diagonal", xaxis=xaxis, yaxis=yaxis)
@@ -972,17 +954,15 @@ def plot_evals(
                 )
 
             for ds in dataset.split("+"):
-                fig.add_trace(
-                    self._draw_line(
-                        x=list(range(len(m.evals[f"{self._metric[0].name}_{ds}"]))),
-                        y=m.evals[f"{self._metric[0].name}_{ds}"],
-                        marker_symbol="circle",
-                        parent=m.name,
-                        child=ds,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=list(range(len(m.evals[f"{self._metric[0].name}_{ds}"]))),
+                    y=m.evals[f"{self._metric[0].name}_{ds}"],
+                    marker_symbol="circle",
+                    parent=m.name,
+                    child=ds,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         BasePlot._fig.used_models.extend(models_c)
@@ -1094,22 +1074,20 @@ def plot_feature_importance(
                     "nor coef_ attribute."
                 ) from None
 
-            fig.add_trace(
-                go.Bar(
-                    x=fi,
-                    y=fi.index,
-                    orientation="h",
-                    marker={
-                        "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
-                        "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
-                    },
-                    hovertemplate="%{x}<extra></extra>",
-                    name=m.name,
-                    legendgroup=m.name,
-                    showlegend=BasePlot._fig.showlegend(m.name, legend),
-                    xaxis=xaxis,
-                    yaxis=yaxis,
-                )
+            fig.add_bar(
+                x=fi,
+                y=fi.index,
+                orientation="h",
+                marker={
+                    "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+                    "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+                },
+                hovertemplate="%{x}<extra></extra>",
+                name=m.name,
+                legendgroup=m.name,
+                showlegend=BasePlot._fig.showlegend(m.name, legend),
+                xaxis=xaxis,
+                yaxis=yaxis,
             )
 
         fig.update_layout(
@@ -1272,30 +1250,26 @@ def plot_forecast(
 
             y_true = m.branch._all.loc[y_pred.index, target_c]
 
-            fig.add_trace(
-                self._draw_line(
-                    x=(x := self._get_plot_index(y_pred)),
-                    y=y_pred,
-                    mode="lines+markers",
-                    parent=m.name,
-                    legend=legend,
-                    xaxis=xaxis2,
-                    yaxis=yaxis,
-                )
+            self._draw_line(
+                x=(x := self._get_plot_index(y_pred)),
+                y=y_pred,
+                mode="lines+markers",
+                parent=m.name,
+                legend=legend,
+                xaxis=xaxis2,
+                yaxis=yaxis,
             )
 
             # Draw residuals
-            fig.add_trace(
-                self._draw_line(
-                    x=x,
-                    y=np.subtract(y_true, y_pred),
-                    mode="lines+markers",
-                    parent=m.name,
-                    legend=legend,
-                    showlegend=False,
-                    xaxis=xaxis2,
-                    yaxis=yaxis2,
-                )
+            self._draw_line(
+                x=x,
+                y=np.subtract(y_true, y_pred),
+                mode="lines+markers",
+                parent=m.name,
+                legend=legend,
+                showlegend=False,
+                xaxis=xaxis2,
+                yaxis=yaxis2,
             )
 
             if plot_interval:
@@ -1343,17 +1317,15 @@ def plot_forecast(
                 )
 
         # Draw original time series
-        fig.add_trace(
-            go.Scatter(
-                x=x,
-                y=y_true,
-                mode="lines+markers",
-                line={"width": 1, "color": "black", "dash": "dash"},
-                opacity=0.6,
-                showlegend=False,
-                xaxis=xaxis2,
-                yaxis=yaxis,
-            )
+        fig.add_scatter(
+            x=x,
+            y=y_true,
+            mode="lines+markers",
+            line={"width": 1, "color": "black", "dash": "dash"},
+            opacity=0.6,
+            showlegend=False,
+            xaxis=xaxis2,
+            yaxis=yaxis,
         )
 
         # Draw horizontal reference line for residuals
@@ -1478,17 +1450,15 @@ def plot_gains(
                     ds, target, method=("decision_function", "predict_proba")
                 )
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)),
-                        y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()),
-                        mode="lines",
-                        parent=m.name,
-                        child=child,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)),
+                    y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()),
+                    mode="lines",
+                    parent=m.name,
+                    child=child,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         self._draw_straight_line((x, y), y="diagonal", xaxis=xaxis, yaxis=yaxis)
@@ -1601,19 +1571,17 @@ def plot_learning_curve(
                     std[m._group].append(m.bootstrap.loc[:, met].std())
 
             for group in x:
-                fig.add_trace(
-                    self._draw_line(
-                        x=x[group],
-                        y=y[group],
-                        mode="lines+markers",
-                        marker_symbol="circle",
-                        error_y={"type": "data", "array": std[group], "visible": True},
-                        parent=group,
-                        child=self._metric[met].name,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=x[group],
+                    y=y[group],
+                    mode="lines+markers",
+                    marker_symbol="circle",
+                    error_y={"type": "data", "array": std[group], "visible": True},
+                    parent=group,
+                    child=self._metric[met].name,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
                 # Add error bands
@@ -1759,18 +1727,15 @@ def plot_lift(
                     ds, target, method=("decision_function", "predict_proba")
                 )
 
-                gains = np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()
-                fig.add_trace(
-                    self._draw_line(
-                        x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)),
-                        y=(y := gains / x),
-                        mode="lines",
-                        parent=m.name,
-                        child=child,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)),
+                    y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum() / x),
+                    mode="lines",
+                    parent=m.name,
+                    child=child,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         self._draw_straight_line((x, y), y=1, xaxis=xaxis, yaxis=yaxis)
@@ -1943,31 +1908,29 @@ class is always the positive one.
             else:
                 color = BasePlot._fig.get_elem("parshap")
 
-            fig.add_trace(
-                go.Scatter(
-                    x=(x := parshap["train"]),
-                    y=(y := parshap["test"]),
-                    mode="markers+text",
-                    marker={
-                        "color": color,
-                        "size": self.marker_size,
-                        "coloraxis": f"coloraxis{xaxis[1:]}",
-                        "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
-                    },
-                    text=m.branch.features,
-                    textposition="top center",
-                    customdata=(data := None if isinstance(color, str) else list(color)),
-                    hovertemplate=(
-                        f"%{{text}}<br>(%{{x}}, %{{y}})"
-                        f"{'<br>Feature importance: %{customdata:.4f}' if data else ''}"
-                        f"<extra>{m.name}</extra>"
-                    ),
-                    name=m.name,
-                    legendgroup=m.name,
-                    showlegend=BasePlot._fig.showlegend(m.name, legend),
-                    xaxis=xaxis,
-                    yaxis=yaxis,
-                )
+            fig.add_scatter(
+                x=(x := parshap["train"]),
+                y=(y := parshap["test"]),
+                mode="markers+text",
+                marker={
+                    "color": color,
+                    "size": self.marker_size,
+                    "coloraxis": f"coloraxis{xaxis[1:]}",
+                    "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
+                },
+                text=m.branch.features,
+                textposition="top center",
+                customdata=(data := None if isinstance(color, str) else list(color)),
+                hovertemplate=(
+                    f"%{{text}}<br>(%{{x}}, %{{y}})"
+                    f"{'<br>Feature importance: %{customdata:.4f}' if data else ''}"
+                    f"<extra>{m.name}</extra>"
+                ),
+                name=m.name,
+                legendgroup=m.name,
+                showlegend=BasePlot._fig.showlegend(m.name, legend),
+                xaxis=xaxis,
+                yaxis=yaxis,
             )
 
         self._draw_straight_line((x, y), y="diagonal", xaxis=xaxis, yaxis=yaxis)
@@ -2198,18 +2161,16 @@ def plot_partial_dependence(
 
                     # Draw the mean of the individual lines
                     if "average" in kind:
-                        fig.add_trace(
-                            go.Scatter(
-                                x=pred["values"][0],
-                                y=pred["average"][target_c].ravel(),
-                                mode="lines",
-                                line={"width": 2, "color": color},
-                                name=m.name,
-                                legendgroup=m.name,
-                                showlegend=BasePlot._fig.showlegend(m.name, legend),
-                                xaxis=ax[0],
-                                yaxis=axes[0][1],
-                            )
+                        fig.add_scatter(
+                            x=pred["values"][0],
+                            y=pred["average"][target_c].ravel(),
+                            mode="lines",
+                            line={"width": 2, "color": color},
+                            name=m.name,
+                            legendgroup=m.name,
+                            showlegend=BasePlot._fig.showlegend(m.name, legend),
+                            xaxis=ax[0],
+                            yaxis=axes[0][1],
                         )
 
                     # Draw all individual (per sample) lines (ICE)
@@ -2221,42 +2182,38 @@ def plot_partial_dependence(
                             replace=False,
                         )
                         for sample in pred["individual"][target_c, idx, :]:
-                            fig.add_trace(
-                                go.Scatter(
-                                    x=pred["values"][0],
-                                    y=sample,
-                                    mode="lines",
-                                    line={"width": 0.5, "color": color},
-                                    name=m.name,
-                                    legendgroup=m.name,
-                                    showlegend=BasePlot._fig.showlegend(m.name, legend),
-                                    xaxis=ax[0],
-                                    yaxis=axes[0][1],
-                                )
+                            fig.add_scatter(
+                                x=pred["values"][0],
+                                y=sample,
+                                mode="lines",
+                                line={"width": 0.5, "color": color},
+                                name=m.name,
+                                legendgroup=m.name,
+                                showlegend=BasePlot._fig.showlegend(m.name, legend),
+                                xaxis=ax[0],
+                                yaxis=axes[0][1],
                             )
 
                 else:
                     colorscale = PALETTE.get(BasePlot._fig.get_elem(m.name), "Teal")
-                    fig.add_trace(
-                        go.Contour(
-                            x=pred["values"][0],
-                            y=pred["values"][1],
-                            z=pred["average"][target_c],
-                            contours={
-                                "showlabels": True,
-                                "labelfont": {
-                                    "size": self.tick_fontsize,
-                                    "color": "white",
-                                },
+                    fig.add_contour(
+                        x=pred["values"][0],
+                        y=pred["values"][1],
+                        z=pred["average"][target_c],
+                        contours={
+                            "showlabels": True,
+                            "labelfont": {
+                                "size": self.tick_fontsize,
+                                "color": "white",
                             },
-                            hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}<extra></extra>",
-                            hoverongaps=False,
-                            colorscale=colorscale,
-                            showscale=False,
-                            showlegend=False,
-                            xaxis=ax[0],
-                            yaxis=axes[0][1],
-                        )
+                        },
+                        hovertemplate="x:%{x}<br>y:%{y}<br>z:%{z}<extra></extra>",
+                        hoverongaps=False,
+                        colorscale=colorscale,
+                        showscale=False,
+                        showlegend=False,
+                        xaxis=ax[0],
+                        yaxis=axes[0][1],
                     )
 
                 self._plot(
@@ -2378,19 +2335,17 @@ def plot_permutation_importance(
                 random_state=self.random_state,
             )
 
-            fig.add_trace(
-                go.Box(
-                    x=permutations["importances"].ravel(),
-                    y=list(np.ravel([[fx] * n_repeats for fx in m.branch.features])),
-                    marker_color=BasePlot._fig.get_elem(m.name),
-                    boxpoints="outliers",
-                    orientation="h",
-                    name=m.name,
-                    legendgroup=m.name,
-                    showlegend=BasePlot._fig.showlegend(m.name, legend),
-                    xaxis=xaxis,
-                    yaxis=yaxis,
-                )
+            fig.add_box(
+                x=permutations["importances"].ravel(),
+                y=list(np.ravel([[fx] * n_repeats for fx in m.branch.features])),
+                marker_color=BasePlot._fig.get_elem(m.name),
+                boxpoints="outliers",
+                orientation="h",
+                name=m.name,
+                legendgroup=m.name,
+                showlegend=BasePlot._fig.showlegend(m.name, legend),
+                xaxis=xaxis,
+                yaxis=yaxis,
             )
 
         fig.update_layout(
@@ -2830,17 +2785,15 @@ def plot_prc(
                 # Get precision-recall pairs for different thresholds
                 prec, rec, _ = precision_recall_curve(y_true, y_pred)
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=rec,
-                        y=prec,
-                        mode="lines",
-                        parent=m.name,
-                        child=child,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=rec,
+                    y=prec,
+                    mode="lines",
+                    parent=m.name,
+                    child=child,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         self._draw_straight_line(
@@ -2971,29 +2924,27 @@ def plot_probabilities(
                 else:
                     hist = y_pred.loc[y_true == v, str(cls)]
 
-                fig.add_trace(
-                    go.Scatter(
-                        x=(x := np.linspace(0, 1, 100)),
-                        y=stats.gaussian_kde(hist)(x),
-                        mode="lines",
-                        line={
-                            "width": 2,
-                            "color": BasePlot._fig.get_elem(m.name),
-                            "dash": BasePlot._fig.get_elem(str(v), "dash"),
-                        },
-                        fill="tonexty",
-                        fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)",
-                        fillpattern={"shape": BasePlot._fig.get_elem(str(v), "shape")},
-                        name=f"{col}={v}",
-                        legendgroup=m.name,
-                        legendgrouptitle={
-                            "text": m.name,
-                            "font_size": self.label_fontsize,
-                        },
-                        showlegend=BasePlot._fig.showlegend(f"{m.name}-{v}", legend),
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_scatter(
+                    x=(x := np.linspace(0, 1, 100)),
+                    y=stats.gaussian_kde(hist)(x),
+                    mode="lines",
+                    line={
+                        "width": 2,
+                        "color": BasePlot._fig.get_elem(m.name),
+                        "dash": BasePlot._fig.get_elem(str(v), "dash"),
+                    },
+                    fill="tonexty",
+                    fillcolor=f"rgba{BasePlot._fig.get_elem(m.name)[3:-1]}, 0.2)",
+                    fillpattern={"shape": BasePlot._fig.get_elem(str(v), "shape")},
+                    name=f"{col}={v}",
+                    legendgroup=m.name,
+                    legendgrouptitle={
+                        "text": m.name,
+                        "font_size": self.label_fontsize,
+                    },
+                    showlegend=BasePlot._fig.showlegend(f"{m.name}-{v}", legend),
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         BasePlot._fig.used_models.extend(models_c)
@@ -3112,33 +3063,29 @@ def plot_residuals(
             for child, ds in self._get_set(rows):
                 y_true, y_pred = m._get_pred(ds, target)
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=y_true,
-                        y=(res := np.subtract(y_true, y_pred)),
-                        mode="markers",
-                        parent=m.name,
-                        child=child,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=y_true,
+                    y=(res := np.subtract(y_true, y_pred)),
+                    mode="markers",
+                    parent=m.name,
+                    child=child,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
-                fig.add_trace(
-                    go.Histogram(
-                        y=res,
-                        bingroup="residuals",
-                        marker={
-                            "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
-                            "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
-                        },
-                        name=m.name,
-                        legendgroup=m.name,
-                        showlegend=False,
-                        xaxis=xaxis2,
-                        yaxis=yaxis,
-                    )
+                fig.add_histogram(
+                    y=res,
+                    bingroup="residuals",
+                    marker={
+                        "color": f"rgba({BasePlot._fig.get_elem(m.name)[4:-1]}, 0.2)",
+                        "line": {"width": 2, "color": BasePlot._fig.get_elem(m.name)},
+                    },
+                    name=m.name,
+                    legendgroup=m.name,
+                    showlegend=False,
+                    xaxis=xaxis2,
+                    yaxis=yaxis,
                 )
 
         self._draw_straight_line((y_true, res), y=0, xaxis=xaxis, yaxis=yaxis)
@@ -3287,40 +3234,36 @@ def plot_results(
                         f"can't be mixed with non-time metrics, got {metric_c}."
                     )
 
-                fig.add_trace(
-                    go.Bar(
-                        x=[m.results[met] for m in models_c],
-                        y=[m.name for m in models_c],
-                        orientation="h",
-                        marker={
-                            "color": f"rgba({BasePlot._fig.get_elem(met)[4:-1]}, 0.2)",
-                            "line": {"width": 2, "color": BasePlot._fig.get_elem(met)},
-                        },
-                        hovertemplate=f"%{{x}}<extra>{met}</extra>",
-                        name=met,
-                        legendgroup=met,
-                        showlegend=BasePlot._fig.showlegend(met, legend),
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_bar(
+                    x=[m.results[met] for m in models_c],
+                    y=[m.name for m in models_c],
+                    orientation="h",
+                    marker={
+                        "color": f"rgba({BasePlot._fig.get_elem(met)[4:-1]}, 0.2)",
+                        "line": {"width": 2, "color": BasePlot._fig.get_elem(met)},
+                    },
+                    hovertemplate=f"%{{x}}<extra>{met}</extra>",
+                    name=met,
+                    legendgroup=met,
+                    showlegend=BasePlot._fig.showlegend(met, legend),
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
             else:
-                fig.add_trace(
-                    go.Bar(
-                        x=[m._get_score(met, rows) for m in models_c],
-                        y=[m.name for m in models_c],
-                        orientation="h",
-                        marker={
-                            "color": f"rgba({BasePlot._fig.get_elem(met.name)[4:-1]}, 0.2)",
-                            "line": {"width": 2, "color": BasePlot._fig.get_elem(met.name)},
-                        },
-                        hovertemplate="%{x}<extra></extra>",
-                        name=met.name,
-                        legendgroup=met.name,
-                        showlegend=BasePlot._fig.showlegend(met, legend),
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                fig.add_bar(
+                    x=[m._get_score(met, rows) for m in models_c],
+                    y=[m.name for m in models_c],
+                    orientation="h",
+                    marker={
+                        "color": f"rgba({BasePlot._fig.get_elem(met.name)[4:-1]}, 0.2)",
+                        "line": {"width": 2, "color": BasePlot._fig.get_elem(met.name)},
+                    },
+                    hovertemplate="%{x}<extra></extra>",
+                    name=met.name,
+                    legendgroup=met.name,
+                    showlegend=BasePlot._fig.showlegend(met, legend),
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         fig.update_layout(
@@ -3441,17 +3384,15 @@ def plot_roc(
                     *m._get_pred(ds, target, method=("decision_function", "predict_proba"))
                 )
 
-                fig.add_trace(
-                    self._draw_line(
-                        x=fpr,
-                        y=tpr,
-                        mode="lines",
-                        parent=m.name,
-                        child=child,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=fpr,
+                    y=tpr,
+                    mode="lines",
+                    parent=m.name,
+                    child=child,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         self._draw_straight_line((fpr, tpr), y="diagonal", xaxis=xaxis, yaxis=yaxis)
@@ -3564,24 +3505,21 @@ def plot_successive_halving(
                     std[m._group].append(m.bootstrap.loc[:, met].std())
 
             for group in x:
-                fig.add_trace(
-                    self._draw_line(
-                        x=x[group],
-                        y=y[group],
-                        mode="lines+markers",
-                        marker_symbol="circle",
-                        error_y={"type": "data", "array": std[group], "visible": True},
-                        parent=group,
-                        child=self._metric[met].name,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=x[group],
+                    y=y[group],
+                    mode="lines+markers",
+                    marker_symbol="circle",
+                    error_y={"type": "data", "array": std[group], "visible": True},
+                    parent=group,
+                    child=self._metric[met].name,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
                 # Add error bands
                 if m.bootstrap is not None:
-                    fillcolor = f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)"
                     fig.add_traces(
                         [
                             go.Scatter(
@@ -3601,7 +3539,7 @@ def plot_successive_halving(
                                 mode="lines",
                                 line={"width": 1, "color": BasePlot._fig.get_elem(group)},
                                 fill="tonexty",
-                                fillcolor=fillcolor,
+                                fillcolor=f"rgba{BasePlot._fig.get_elem(group)[3:-1]}, 0.2)",
                                 hovertemplate="%{y}<extra>lower bound</extra>",
                                 legendgroup=group,
                                 showlegend=False,
@@ -3743,16 +3681,14 @@ def plot_threshold(
         for m in models_c:
             y_true, y_pred = m._get_pred(rows, target, method="predict_proba")
             for met in metric_c:
-                fig.add_trace(
-                    self._draw_line(
-                        x=(x := np.linspace(0, 1, steps)),
-                        y=[met(y_true, y_pred >= step) for step in x],
-                        parent=m.name,
-                        child=met.__name__,
-                        legend=legend,
-                        xaxis=xaxis,
-                        yaxis=yaxis,
-                    )
+                self._draw_line(
+                    x=(x := np.linspace(0, 1, steps)),
+                    y=[met(y_true, y_pred >= step) for step in x],
+                    parent=m.name,
+                    child=met.__name__,
+                    legend=legend,
+                    xaxis=xaxis,
+                    yaxis=yaxis,
                 )
 
         BasePlot._fig.used_models.extend(models_c)

From 2ae20b4f5e894ab24ceb74ac219b86c5c20850d7 Mon Sep 17 00:00:00 2001
From: Marco van den Boom <rob6874@robeco.nl>
Date: Thu, 25 Jan 2024 15:54:55 +0100
Subject: [PATCH 2/2] added acf, pacf and decomposition plots

---
 atom/atom.py                                 |  18 +-
 atom/data_cleaning.py                        |   7 +-
 atom/plots/basefigure.py                     |   2 +-
 atom/plots/baseplot.py                       |   9 +-
 atom/plots/dataplot.py                       | 487 +++++++++++++++++--
 atom/plots/hyperparametertuningplot.py       |   4 +-
 atom/plots/predictionplot.py                 |  35 +-
 atom/plots/shapplot.py                       |  11 +-
 atom/utils/types.py                          |  15 +
 docs/API/ATOM/atomclassifier/index.html      |   6 +-
 docs/API/ATOM/atomforecaster/index.html      |   6 +-
 docs/API/ATOM/atomregressor/index.html       |   6 +-
 docs/API/plots/plot_parshap/index.html       |   2 +-
 docs/search/search_index.json                |   2 +-
 docs_sources/api/plots/plot_acf.md           |  16 +
 docs_sources/api/plots/plot_decomposition.md |  16 +
 docs_sources/api/plots/plot_pacf.md          |  16 +
 docs_sources/changelog/v5.x.x.md             |  59 ---
 docs_sources/changelog/v6.x.x.md             |  59 +++
 docs_sources/dependencies.md                 |   2 +-
 mkdocs.yml                                   |   3 +
 pyproject.toml                               |   2 +-
 tests/test_plots.py                          |  12 +
 23 files changed, 632 insertions(+), 163 deletions(-)
 create mode 100644 docs_sources/api/plots/plot_acf.md
 create mode 100644 docs_sources/api/plots/plot_decomposition.md
 create mode 100644 docs_sources/api/plots/plot_pacf.md
 create mode 100644 docs_sources/changelog/v6.x.x.md

diff --git a/atom/atom.py b/atom/atom.py
index e6e2fdc89..2095c9c74 100644
--- a/atom/atom.py
+++ b/atom/atom.py
@@ -56,9 +56,9 @@
     FloatZeroToOneInc, Index, IndexSelector, Int, IntLargerEqualZero,
     IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems,
     NJobs, NormalizerStrats, NumericalStrats, Operators, Pandas, Predictor,
-    PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, Sequence,
-    Series, TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings,
-    XSelector, YSelector, sequence_t,
+    PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality,
+    SeasonalityMode, Sequence, Series, TargetSelector, Transformer,
+    VectorizerStarts, Verbose, Warnings, XSelector, YSelector, sequence_t,
 )
 from atom.utils.utils import (
     ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, bk,
@@ -922,7 +922,7 @@ def shrink(
             Whether to convert all features to sparse format. The value
             that is compressed is the most frequent value in the column.
 
-        columns: int, str, segment, sequence or None, default=None
+        columns: int, str, segment, sequence, dataframe or None, default=None
             [Selection of columns][row-and-column-selection] to shrink. If
             None, transform all columns.
 
@@ -1201,7 +1201,7 @@ def _add_transformer(
             has the `n_jobs` and/or `random_state` parameters, it
             adopts atom's values.
 
-        columns: int, str, segment, sequence or None, default=None
+        columns: int, str, segment, sequence, dataframe or None, default=None
             Columns in the dataset to transform. If None, transform
             all features.
 
@@ -1388,7 +1388,7 @@ def add(
             instance), and it has the `n_jobs` and/or `random_state`
             parameters, it adopts atom's values.
 
-        columns: int, str, segment, sequence or None, default=None
+        columns: int, str, segment, sequence, dataframe or None, default=None
             [Selection of columns][row-and-column-selection] to
             transform. Only select features or the target column, not
             both at the same time (if that happens, the target column
@@ -1564,7 +1564,7 @@ def decompose(
         self,
         *,
         model: str | Predictor | None = None,
-        mode: Literal["additive", "multiplicative"] = "additive",
+        mode: SeasonalityMode = "additive",
         **kwargs,
     ):
         """Detrend and deseasonalize the time series.
@@ -1584,9 +1584,7 @@ def decompose(
             * Use the `columns` parameter to only decompose the target
               column, e.g., `atom.decompose(columns=atom.target)`.
             * Use the [plot_decomposition][] method to visualize the
-              trend, seasonality and residuals of the time series. This
-              can help to determine if the data follows an additive or
-              multiplicative trend.
+              trend, seasonality and residuals of the time series.
 
         """
         columns = kwargs.pop("columns", None)
diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py
index fb4452000..20c0886c6 100644
--- a/atom/data_cleaning.py
+++ b/atom/data_cleaning.py
@@ -47,8 +47,9 @@
     Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine,
     Estimator, FloatLargerZero, IntLargerEqualZero, IntLargerTwo,
     IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor,
-    PrunerStrats, Scalar, ScalerStrats, Sequence, Series, Transformer, Verbose,
-    XSelector, YSelector, dataframe_t, sequence_t, series_t,
+    PrunerStrats, Scalar, ScalerStrats, SeasonalityMode, Sequence, Series,
+    Transformer, Verbose, XSelector, YSelector, dataframe_t, sequence_t,
+    series_t,
 )
 from atom.utils.utils import (
     Goal, bk, composed, crash, get_col_order, get_cols, it, lst, merge,
@@ -1083,7 +1084,7 @@ def __init__(
         *,
         model: str | Predictor | None = None,
         sp: IntLargerZero | None = None,
-        mode: Literal["additive", "multiplicative"] = "additive",
+        mode: SeasonalityMode = "additive",
         n_jobs: NJobs = 1,
         verbose: Verbose = 0,
         logger: str | Path | Logger | None = None,
diff --git a/atom/plots/basefigure.py b/atom/plots/basefigure.py
index f77c973c2..e50d22bb2 100644
--- a/atom/plots/basefigure.py
+++ b/atom/plots/basefigure.py
@@ -190,7 +190,7 @@ def get_elem(
         else:
             return self.style[element].setdefault(name, next(getattr(self, element)))
 
-    def showlegend(self, name: str, legend: Legend | dict | None) -> bool:
+    def showlegend(self, name: str, legend: Legend | dict[str, Any] | None) -> bool:
         """Get whether the trace should be showed in the legend.
 
         If there's already a trace with the same name, it's not
diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py
index faa5fbb72..72cdf8774 100644
--- a/atom/plots/baseplot.py
+++ b/atom/plots/baseplot.py
@@ -408,14 +408,16 @@ def _draw_line(
         child: str or None, default=None
             Name of the secondary attribute.
 
-        legend: str, dict or None
+        legend: str, dict or None, default=None
             Legend argument provided by the user.
 
         **kwargs
             Additional keyword arguments for the trace.
 
         """
-        Baseplot._fig.figure.add_scatter(
+        BasePlot._fig.figure.add_scatter(
+            name=kwargs.pop("name", child or parent),
+            mode=kwargs.pop("mode", "lines"),
             line=kwargs.pop(
                 "line", {
                     "width": self.line_width,
@@ -435,7 +437,6 @@ def _draw_line(
                 "hovertemplate",
                 f"(%{{x}}, %{{y}})<extra>{parent}{f' - {child}' if child else ''}</extra>",
             ),
-            name=kwargs.pop("name", child or parent),
             legendgroup=kwargs.pop("legendgroup", parent),
             legendgrouptitle=kwargs.pop(
                 "legendgrouptitle",
@@ -443,7 +444,7 @@ def _draw_line(
             ),
             showlegend=kwargs.pop(
                 "showlegend",
-                BasePlot._fig.showlegend(f"{parent}-{child}", legend)
+                BasePlot._fig.showlegend(f"{parent}-{child}" if child else parent, legend)
             ),
             **kwargs,
         )
diff --git a/atom/plots/dataplot.py b/atom/plots/dataplot.py
index de26e44b5..df5a6e175 100644
--- a/atom/plots/dataplot.py
+++ b/atom/plots/dataplot.py
@@ -10,10 +10,9 @@
 from abc import ABCMeta
 from pathlib import Path
 from typing import Any, Literal
-from statsmodels.tsa.stattools import pacf
+
 import numpy as np
 import pandas as pd
-from sklearn.utils.metaestimators import available_if
 import plotly.graph_objects as go
 from beartype import beartype
 from nltk.collocations import (
@@ -22,15 +21,19 @@
 )
 from scipy import stats
 from sklearn.base import is_classifier
+from sklearn.utils.metaestimators import available_if
+from statsmodels.tsa.seasonal import seasonal_decompose
+from statsmodels.tsa.stattools import acf, pacf
 
 from atom.plots.baseplot import BasePlot
 from atom.utils.constants import PALETTE
 from atom.utils.types import (
-    Bool, ColumnSelector, DataFrame, Int, IntLargerZero, Legend, RowSelector,
-    Segment, Sequence, Series,
+    Bool, ColumnSelector, DataFrame, Int, IntLargerZero, Legend, PACFMethods,
+    RowSelector, SeasonalityMode, Segment, Sequence, Series,
 )
 from atom.utils.utils import (
-    check_dependency, crash, divide, get_corpus, lst, replace_missing, rnd, has_task
+    check_dependency, crash, divide, get_corpus, has_task, lst,
+    replace_missing, rnd,
 )
 
 
@@ -44,6 +47,170 @@ class DataPlot(BasePlot, metaclass=ABCMeta):
 
     """
 
+    @available_if(has_task("forecast"))
+    @crash
+    def plot_acf(
+        self,
+        columns: ColumnSelector | None = None,
+        nlags: IntLargerZero | None = None,
+        *,
+        title: str | dict[str, Any] | None = None,
+        legend: Legend | dict[str, Any] | None = "upper right",
+        figsize: tuple[IntLargerZero, IntLargerZero] | None = None,
+        filename: str | Path | None = None,
+        display: Bool | None = True,
+    ) -> go.Figure | None:
+        """Plot the autocorrelation function.
+
+        The autocorrelation function (ACF) measures the correlation
+        between a time series and lagged versions of itself. It's
+        useful, for example, to identify the order of an autoregressive
+        model. This plot is only available for [forecast][time-series]
+        tasks.
+
+        Parameters
+        ----------
+        columns: int, str, segment, sequence, dataframe or None, default=None
+            Columns to plot the acf from. If None, it selects the
+            target column.
+
+        nlags: int or None, default=None
+            Number of lags to return autocorrelation for. If None, it
+            uses `min(10 * np.log10(len(y)), len(y) // 2 - 1)`. The
+            returned value includes lag 0 (i.e., 1), so the size of the
+            vector is `(nlags + 1,)`.
+
+        title: str, dict or None, default=None
+            Title for the plot.
+
+            - If None, no title is shown.
+            - If str, text for the title.
+            - If dict, [title configuration][parameters].
+
+        legend: str, dict or None, default="upper right"
+            Legend for the plot. See the [user guide][parameters] for
+            an extended description of the choices.
+
+            - If None: No legend is shown.
+            - If str: Location where to show the legend.
+            - If dict: Legend configuration.
+
+        figsize: tuple or None, default=None
+            Figure's size in pixels, format as (x, y). If None, it
+            adapts the size to the number of lags shown.
+
+        filename: str, Path or None, default=None
+            Save the plot using this name. Use "auto" for automatic
+            naming. The type of the file depends on the provided name
+            (.html, .png, .pdf, etc...). If `filename` has no file type,
+            the plot is saved as html. If None, the plot is not saved.
+
+        display: bool or None, default=True
+            Whether to render the plot. If None, it returns the figure.
+
+        Returns
+        -------
+        [go.Figure][] or None
+            Plot object. Only returned if `display=None`.
+
+        See Also
+        --------
+        atom.plots:DataPlot.plot_pacf
+        atom.plots:DataPlot.plot_decomposition
+        atom.plots:DataPlot.plot_ttf
+
+        Examples
+        --------
+        ```pycon
+        from atom import ATOMForecaster
+        from sktime.datasets import load_airline
+
+        y = load_airline()
+
+        atom = ATOMForecaster(y, random_state=1)
+        atom.plot_acf()
+        ```
+
+        """
+        if columns is None:
+            columns_c = lst(self.branch.target)
+        else:
+            columns_c = self.branch._get_columns(columns)
+
+        fig = self._get_figure()
+        xaxis, yaxis = BasePlot._fig.get_axes()
+
+        if nlags is None:
+            nlags = min(int(10 * np.log10(self.branch.shape[0])), self.branch.shape[0] // 2 - 1)
+
+        for col in columns_c:
+            # Returns correlation array and confidence interval
+            corr, conf = acf(self.branch.dataset[col], nlags=nlags, alpha=0.05)
+
+            for pos in (x := np.arange(len(corr))):
+                self._draw_line(
+                    x=(pos, pos),
+                    y=(0, corr[pos]),
+                    parent=col,
+                    hoverinfo="skip",
+                    xaxis=xaxis,
+                    yaxis=yaxis,
+                )
+
+            self._draw_line(
+                x=x,
+                y=corr,
+                parent=col,
+                mode="markers",
+                legend=legend,
+                xaxis=xaxis,
+                yaxis=yaxis,
+            )
+
+            fig.add_traces(
+                [
+                    go.Scatter(
+                        x=x,
+                        y=np.subtract(conf[:, 1], corr),
+                        mode="lines",
+                        line={"width": 1, "color": BasePlot._fig.get_elem(col)},
+                        hovertemplate="%{y}<extra>upper bound</extra>",
+                        legendgroup=col,
+                        showlegend=False,
+                        xaxis=xaxis,
+                        yaxis=yaxis,
+                    ),
+                    go.Scatter(
+                        x=x,
+                        y=np.subtract(conf[:, 0], corr),
+                        mode="lines",
+                        line={"width": 1, "color": BasePlot._fig.get_elem(col)},
+                        fill="tonexty",
+                        fillcolor=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)",
+                        hovertemplate="%{y}<extra>lower bound</extra>",
+                        legendgroup=col,
+                        showlegend=False,
+                        xaxis=xaxis,
+                        yaxis=yaxis,
+                    ),
+                ]
+            )
+
+        fig.update_yaxes(zerolinecolor="black")
+        fig.update_layout({"hovermode": "x unified"})
+
+        return self._plot(
+            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+            xlabel="Lag",
+            ylabel="Autocorrelation",
+            title=title,
+            legend=legend,
+            figsize=figsize or (700 + nlags * 10, 600),
+            plotname="plot_acf",
+            filename=filename,
+            display=display,
+        )
+
     @crash
     def plot_components(
         self,
@@ -294,6 +461,178 @@ def plot_correlation(
             display=display,
         )
 
+    @available_if(has_task("forecast"))
+    @crash
+    def plot_decomposition(
+        self,
+        columns: ColumnSelector | None = None,
+        mode: SeasonalityMode = "additive",
+        *,
+        title: str | dict[str, Any] | None = None,
+        legend: Legend | dict[str, Any] | None = "out",
+        figsize: tuple[IntLargerZero, IntLargerZero] = (900, 900),
+        filename: str | Path | None = None,
+        display: Bool | None = True,
+    ) -> go.Figure | None:
+        """Plot the trend, seasonality and residuals of a time series.
+
+        This plot is only available for [forecast][time-series] tasks.
+
+        !!! tip
+            Use atom's [decompose][atomforecaster-decompose] method to
+            remove trend and seasonality from the data.
+
+        Parameters
+        ----------
+        columns: int, str, segment, sequence, dataframe or None, default=None
+            [Selection of columns][row-and-column-selection] to plot.
+            If None, the target column is selected.
+
+        mode: str, default="additive"
+            Mode of the decomposition. Choose from:
+
+            - "additive": Assumes the components have a linear relation,
+              i.e., y(t) = level + trend + seasonality + noise.
+            - "multiplicative": Assumes the components have a nonlinear
+              relation, i.e., y(t) = level * trend * seasonality * noise.
+
+        title: str, dict or None, default=None
+            Title for the plot.
+
+            - If None, no title is shown.
+            - If str, text for the title.
+            - If dict, [title configuration][parameters].
+
+        legend: str, dict or None, default="out"
+            Legend for the plot. See the [user guide][parameters] for
+            an extended description of the choices.
+
+            - If None: No legend is shown.
+            - If str: Location where to show the legend.
+            - If dict: Legend configuration.
+
+        figsize: tuple, default=(900, 900)
+            Figure's size in pixels, format as (x, y).
+
+        filename: str, Path or None, default=None
+            Save the plot using this name. Use "auto" for automatic
+            naming. The type of the file depends on the provided name
+            (.html, .png, .pdf, etc...). If `filename` has no file type,
+            the plot is saved as html. If None, the plot is not saved.
+
+        display: bool or None, default=True
+            Whether to render the plot. If None, it returns the figure.
+
+        Returns
+        -------
+        [go.Figure][] or None
+            Plot object. Only returned if `display=None`.
+
+        See Also
+        --------
+        atom.plots:DataPlot.plot_acf
+        atom.plots:DataPlot.plot_pacf
+        atom.plots:DataPlot.plot_series
+
+        Examples
+        --------
+        ```pycon
+        from atom import ATOMForecaster
+        from sktime.datasets import load_airline
+
+        y = load_airline()
+
+        atom = ATOMForecaster(y, random_state=1)
+        atom.plot_decomposition()
+        ```
+
+        """
+        if columns is None:
+            columns_c = lst(self.branch.target)
+        else:
+            columns_c = self.branch._get_columns(columns)
+
+        self._get_figure()
+        xaxis, yaxis = BasePlot._fig.get_axes(y=(0.76, 1.0))
+        xaxis2, yaxis2 = BasePlot._fig.get_axes(y=(0.51, 0.74))
+        xaxis3, yaxis3 = BasePlot._fig.get_axes(y=(0.26, 0.49))
+        xaxis4, yaxis4 = BasePlot._fig.get_axes(y=(0.0, 0.24))
+
+        for col in columns_c:
+            # Decompose every column on its own so each gets its own traces
+            decompose = seasonal_decompose(
+                x=self.branch.dataset[col],
+                model=mode,
+                period=self.sp,
+            )
+
+            self._draw_line(
+                x=(x := self._get_plot_index(decompose.trend)),
+                y=decompose.observed,
+                parent=col,
+                child="observed",
+                legend=legend,
+                xaxis=xaxis4,
+                yaxis=yaxis,
+            )
+
+            self._draw_line(
+                x=x,
+                y=decompose.trend,
+                parent=col,
+                child="trend",
+                legend=legend,
+                xaxis=xaxis4,
+                yaxis=yaxis2,
+            )
+
+            self._draw_line(
+                x=x,
+                y=decompose.seasonal,
+                parent=col,
+                child="seasonal",
+                legend=legend,
+                xaxis=xaxis4,
+                yaxis=yaxis3,
+            )
+
+            self._draw_line(
+                x=x,
+                y=decompose.resid,
+                parent=col,
+                child="residual",
+                legend=legend,
+                xaxis=xaxis4,
+                yaxis=yaxis4,
+            )
+
+        self._plot(
+            ax=(f"xaxis{xaxis2[1:]}", f"yaxis{yaxis2[1:]}"),
+            ylabel="Values",
+        )
+
+        self._plot(
+            ax=(f"xaxis{xaxis3[1:]}", f"yaxis{yaxis3[1:]}"),
+            ylabel="Values",
+        )
+
+        self._plot(
+            ax=(f"xaxis{xaxis4[1:]}", f"yaxis{yaxis4[1:]}"),
+            ylabel="Values",
+        )
+
+        return self._plot(
+            ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
+            xlabel=self.branch.dataset.index.name or "index",
+            ylabel="Values",
+            title=title,
+            legend=legend,
+            figsize=figsize,
+            plotname="plot_decomposition",
+            filename=filename,
+            display=display,
+        )
+
     @crash
     def plot_distribution(
         self,
@@ -673,21 +1012,22 @@ def get_text(column: Series) -> Series:
     def plot_pacf(
         self,
         columns: ColumnSelector | None = None,
-        show: IntLargerZero | None = 10,
+        nlags: IntLargerZero | None = None,
+        method: PACFMethods = "ywadjusted",
         *,
         title: str | dict[str, Any] | None = None,
-        legend: Legend | dict[str, Any] | None = "lower right",
+        legend: Legend | dict[str, Any] | None = "upper right",
         figsize: tuple[IntLargerZero, IntLargerZero] | None = None,
         filename: str | Path | None = None,
         display: Bool | None = True,
     ) -> go.Figure | None:
         """Plot the partial autocorrelation function.
 
-        Missing values are ignored.
-
-        !!! tip
-            Use atom's [decompose][atomforecaster-decompose] method to
-            remove trend and seasonality from the data.
+        The partial autocorrelation function (PACF) measures the
+        correlation between a time series and lagged versions of
+        itself. It's useful, for example, to identify the order of
+        an autoregressive model. This plot is only available for
+        [forecast][time-series] tasks.
 
         Parameters
         ----------
@@ -695,9 +1035,29 @@ def plot_pacf(
             Columns to plot the pacf from. If None, it selects the
             target column.
 
-        show: int or None, default=10
-            Number of n-grams (ordered by number of occurrences) to
-            show in the plot. If none, show all n-grams (up to 200).
+        nlags: int or None, default=None
+            Number of lags to return autocorrelation for. If None, it
+            uses `min(10 * np.log10(len(y)), len(y) // 2 - 1)`. The
+            returned value includes lag 0 (i.e., 1), so the size of the
+            vector is `(nlags + 1,)`.
+
+        method: str, default="ywadjusted"
+            Specifies which method to use for the calculations.
+
+            - "yw" or "ywadjusted": Yule-Walker with sample-size
+              adjustment in denominator for acovf.
+            - "ywm" or "ywmle": Yule-Walker without adjustment.
+            - "ols": Regression of time series on lags of it and on
+              constant.
+            - "ols-inefficient": Regression of time series on lags using
+              a single common sample to estimate all pacf coefficients.
+            - "ols-adjusted": Regression of time series on lags with a
+              bias adjustment.
+            - "ld" or "ldadjusted": Levinson-Durbin recursion with bias
+              correction.
+            - "ldb" or "ldbiased": Levinson-Durbin recursion without bias
+              correction.
+            - "burg": Burg's partial autocorrelation estimator.
 
         title: str, dict or None, default=None
             Title for the plot.
@@ -706,7 +1066,7 @@ def plot_pacf(
             - If str, text for the title.
             - If dict, [title configuration][parameters].
 
-        legend: str, dict or None, default="lower right"
+        legend: str, dict or None, default="upper right"
             Legend for the plot. See the [user guide][parameters] for
             an extended description of the choices.
 
@@ -716,7 +1076,7 @@ def plot_pacf(
 
         figsize: tuple or None, default=None
             Figure's size in pixels, format as (x, y). If None, it
-            adapts the size to the number of n-grams shown.
+            adapts the size to the number of lags shown.
 
         filename: str, Path or None, default=None
             Save the plot using this name. Use "auto" for automatic
@@ -755,39 +1115,76 @@ def plot_pacf(
             columns_c = lst(self.branch.target)
         else:
             columns_c = self.branch._get_columns(columns)
-        show_c = self._get_show(show)
 
         fig = self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
+        if nlags is None:
+            nlags = min(int(10 * np.log10(self.branch.shape[0])), self.branch.shape[0] // 2 - 1)
+
         for col in columns_c:
-            corr_array = pacf(self.branch.dataset[col].dropna(), nlags=10, alpha=0.05)
+            # Returns correlation array and confidence interval
+            corr, conf = pacf(self.branch.dataset[col], nlags=nlags, method=method, alpha=0.05)
 
-            lower_y = corr_array[1][:, 0] - corr_array[0]
-            upper_y = corr_array[1][:, 1] - corr_array[0]
+            for pos in (x := np.arange(len(corr))):
+                self._draw_line(
+                    x=(pos, pos),
+                    y=(0, corr[pos]),
+                    parent=col,
+                    hoverinfo="skip",
+                    xaxis=xaxis,
+                    yaxis=yaxis,
+                )
 
-            for x in range(len(corr_array[0])):
-                fig.add_scatter(x=(x, x), y=(0, corr_array[0][x]), mode='lines', line_color='#3f3f3f', xaxis=xaxis, yaxis=yaxis)
+            self._draw_line(
+                x=x,
+                y=corr,
+                parent=col,
+                mode="markers",
+                legend=legend,
+                xaxis=xaxis,
+                yaxis=yaxis,
+            )
 
-            fig.add_scatter(x=np.arange(len(corr_array[0])), y=corr_array[0], mode='markers',
-                            marker_color='#1f77b4',
-                            marker_size=12, xaxis=xaxis, yaxis=yaxis)
-            fig.add_scatter(x=np.arange(len(corr_array[0])), y=upper_y, mode='lines',
-                            line_color='rgba(255,255,255,0)', xaxis=xaxis, yaxis=yaxis)
-            fig.add_scatter(x=np.arange(len(corr_array[0])), y=lower_y, mode='lines',
-                            fillcolor='rgba(32, 146, 230,0.3)',
-                            fill='tonexty', line_color='rgba(255,255,255,0)', xaxis=xaxis, yaxis=yaxis)
+            fig.add_traces(
+                [
+                    go.Scatter(
+                        x=x,
+                        y=np.subtract(conf[:, 1], corr),
+                        mode="lines",
+                        line={"width": 1, "color": BasePlot._fig.get_elem(col)},
+                        hovertemplate="%{y}<extra>upper bound</extra>",
+                        legendgroup=col,
+                        showlegend=False,
+                        xaxis=xaxis,
+                        yaxis=yaxis,
+                    ),
+                    go.Scatter(
+                        x=x,
+                        y=np.subtract(conf[:, 0], corr),
+                        mode="lines",
+                        line={"width": 1, "color": BasePlot._fig.get_elem(col)},
+                        fill="tonexty",
+                        fillcolor=f"rgba({BasePlot._fig.get_elem(col)[4:-1]}, 0.2)",
+                        hovertemplate="%{y}<extra>lower bound</extra>",
+                        legendgroup=col,
+                        showlegend=False,
+                        xaxis=xaxis,
+                        yaxis=yaxis,
+                    ),
+                ]
+            )
 
-            fig.update_traces(showlegend=False)
-            # fig.update_xaxes(range=[-1, 42])
-            fig.update_yaxes(zerolinecolor="black")
+        fig.update_yaxes(zerolinecolor="black")
+        fig.update_layout({"hovermode": "x unified"})
 
         return self._plot(
             ax=(f"xaxis{xaxis[1:]}", f"yaxis{yaxis[1:]}"),
             xlabel="Lag",
+            ylabel="Partial autocorrelation",
             title=title,
             legend=legend,
-            figsize=figsize or (900, 400 + show_c * 50),
+            figsize=figsize or (700 + nlags * 10, 600),
             plotname="plot_pacf",
             filename=filename,
             display=display,
@@ -994,7 +1391,7 @@ def plot_qq(
         """
         columns_c = self.branch._get_columns(columns)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         percentiles = np.linspace(0, 100, 101)
@@ -1285,20 +1682,18 @@ def plot_rfecv(
         mean = self.rfecv_.cv_results_["mean_test_score"]
         std = self.rfecv_.cv_results_["std_test_score"]
 
-        fig.add_scatter(
+        self._draw_line(
             x=list(x),
             y=mean,
+            parent="rfecv",
+            name=ylabel,
             mode="lines+markers",
-            line={"width": self.line_width, "color": BasePlot._fig.get_elem("rfecv")},
             marker={
                 "symbol": symbols,
                 "size": sizes,
                 "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
                 "opacity": 1,
             },
-            name=ylabel,
-            legendgroup="rfecv",
-            showlegend=BasePlot._fig.showlegend("rfecv", legend),
             xaxis=xaxis,
             yaxis=yaxis,
         )
@@ -1350,6 +1745,7 @@ def plot_rfecv(
             display=display,
         )
 
+    @available_if(has_task("forecast"))
     @crash
     def plot_series(
         self,
@@ -1364,8 +1760,7 @@ def plot_series(
     ) -> go.Figure | None:
         """Plot a data series.
 
-        This plot is specially useful to plot the time series for
-        [forecast][time-series] tasks.
+        This plot is only available for [forecast][time-series] tasks.
 
         Parameters
         ----------
@@ -1437,7 +1832,7 @@ def plot_series(
         else:
             columns_c = self.branch._get_columns(columns, include_target=True)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for col in columns_c:
@@ -1445,14 +1840,14 @@ def plot_series(
                 self._draw_line(
                     x=self._get_plot_index(y := self.branch._get_rows(ds)[col]),
                     y=y,
+                    parent=col,
+                    child=child,
                     mode="lines+markers",
                     marker={
                         "size": self.marker_size,
                         "color": BasePlot._fig.get_elem(col),
                         "line": {"width": 1, "color": "rgba(255, 255, 255, 0.9)"},
                     },
-                    parent=col,
-                    child=child,
                     legend=legend,
                     xaxis=xaxis,
                     yaxis=yaxis,
diff --git a/atom/plots/hyperparametertuningplot.py b/atom/plots/hyperparametertuningplot.py
index e520a6e03..5b808badc 100644
--- a/atom/plots/hyperparametertuningplot.py
+++ b/atom/plots/hyperparametertuningplot.py
@@ -248,7 +248,7 @@ def plot_edf(
         x_max = bk.concat([m.trials[metric_c] for m in models_c]).max(axis=None)
         x = np.linspace(x_min, x_max, 100)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
@@ -1178,7 +1178,7 @@ def plot_terminator_improvement(
         models_c = self._get_plot_models(models, ensembles=False)
         models_c = self._check_hyperparams(models_c)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
diff --git a/atom/plots/predictionplot.py b/atom/plots/predictionplot.py
index 69f5fcd25..af0a13b34 100644
--- a/atom/plots/predictionplot.py
+++ b/atom/plots/predictionplot.py
@@ -687,7 +687,7 @@ def plot_det(
         """
         models_c = self._get_plot_models(models)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
@@ -700,7 +700,6 @@ def plot_det(
                 self._draw_line(
                     x=fpr,
                     y=fnr,
-                    mode="lines",
                     parent=m.name,
                     child=child,
                     legend=legend,
@@ -812,7 +811,7 @@ def plot_errors(
         """
         models_c = self._get_plot_models(models)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
@@ -839,7 +838,6 @@ def plot_errors(
                 self._draw_line(
                     x=(x := np.linspace(y_true.min(), y_true.max(), 100)),
                     y=estimator.predict(x[:, np.newaxis]),
-                    mode="lines",
                     hovertemplate="(%{x}, %{y})<extra></extra>",
                     parent=m.name,
                     legend=None,
@@ -943,7 +941,7 @@ def plot_evals(
         """
         models_c = self._get_plot_models(models, ensembles=False)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
@@ -1441,7 +1439,7 @@ def plot_gains(
         """
         models_c = self._get_plot_models(models)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
@@ -1453,7 +1451,6 @@ def plot_gains(
                 self._draw_line(
                     x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)),
                     y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum()),
-                    mode="lines",
                     parent=m.name,
                     child=child,
                     legend=legend,
@@ -1718,7 +1715,7 @@ def plot_lift(
         """
         models_c = self._get_plot_models(models)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
@@ -1730,7 +1727,6 @@ def plot_lift(
                 self._draw_line(
                     x=(x := np.arange(start=1, stop=len(y_true) + 1) / len(y_true)),
                     y=(y := np.cumsum(y_true.iloc[np.argsort(y_pred)[::-1]]) / y_true.sum() / x),
-                    mode="lines",
                     parent=m.name,
                     child=child,
                     legend=legend,
@@ -1785,8 +1781,9 @@ def plot_parshap(
         models: int, str, Model, segment, sequence or None, default=None
             Models to plot. If None, all models are selected.
 
-        columns: int, str, segment, sequence or None, default=None
-            XSelector to plot. If None, it plots all features.
+        columns: int, str, segment, sequence, dataframe or None, default=None
+            [Feature set][row-and-column-selection] to plot. If None,
+            it selects all features.
 
         target: int, str or tuple, default=1
             Class in the target column to target. For multioutput tasks,
@@ -1984,9 +1981,9 @@ def plot_partial_dependence(
         models: int, str, Model, segment, sequence or None, default=None
             Models to plot. If None, all models are selected.
 
-        columns: int, str, segment, sequence, dataframe, default=(0, 1, 2)
-            [XSelector][row-and-column-selection] to get the partial
-            dependence from.
+        columns: int, str, segment, sequence or dataframe, default=(0, 1, 2)
+            [Feature set][row-and-column-selection] to get the
+            partial dependence from.
 
         kind: str or sequence, default="average"
             Kind of dependence to plot. Use a sequence or add `+` between
@@ -2773,7 +2770,7 @@ def plot_prc(
         """
         models_c = self._get_plot_models(models)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
@@ -2788,7 +2785,6 @@ def plot_prc(
                 self._draw_line(
                     x=rec,
                     y=prec,
-                    mode="lines",
                     parent=m.name,
                     child=child,
                     legend=legend,
@@ -2924,7 +2920,7 @@ def plot_probabilities(
                 else:
                     hist = y_pred.loc[y_true == v, str(cls)]
 
-                fig.add_sactter(
+                fig.add_scatter(
                     x=(x := np.linspace(0, 1, 100)),
                     y=stats.gaussian_kde(hist)(x),
                     mode="lines",
@@ -3374,7 +3370,7 @@ def plot_roc(
         """
         models_c = self._get_plot_models(models)
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
@@ -3387,7 +3383,6 @@ def plot_roc(
                 self._draw_line(
                     x=fpr,
                     y=tpr,
-                    mode="lines",
                     parent=m.name,
                     child=child,
                     legend=legend,
@@ -3675,7 +3670,7 @@ def plot_threshold(
                     metric_c.append(m)
             metric_c = [get_custom_scorer(m)._score_func for m in metric_c]
 
-        fig = self._get_figure()
+        self._get_figure()
         xaxis, yaxis = BasePlot._fig.get_axes()
 
         for m in models_c:
diff --git a/atom/plots/shapplot.py b/atom/plots/shapplot.py
index 0242f8788..7c5bc10d3 100644
--- a/atom/plots/shapplot.py
+++ b/atom/plots/shapplot.py
@@ -20,8 +20,8 @@
 
 from atom.plots.baseplot import BasePlot
 from atom.utils.types import (
-    Bool, Int, IntLargerZero, Legend, ModelSelector, RowSelector,
-    TargetsSelector,
+    Bool, ColumnSelector, Int, IntLargerZero, Legend, ModelSelector,
+    RowSelector, TargetsSelector,
 )
 from atom.utils.utils import check_canvas, crash, has_task
 
@@ -645,7 +645,7 @@ def plot_shap_scatter(
         self,
         models: ModelSelector | None = None,
         rows: RowSelector = "test",
-        columns: Int | str = 0,
+        columns: ColumnSelector = 0,
         target: TargetsSelector = 1,
         *,
         title: str | dict[str, Any] | None = None,
@@ -676,8 +676,9 @@ def plot_shap_scatter(
             plot_shap_scatter method does not support plotting a single
             sample.
 
-        columns: int or str, default=0
-            Column to plot.
+        columns: int, str, segment, sequence or dataframe, default=0
+            [Feature][row-and-column-selection] to plot. Only one
+            column can be selected.
 
         target: int, str or tuple, default=1
             Class in the target column to target. For multioutput tasks,
diff --git a/atom/utils/types.py b/atom/utils/types.py
index d1075aa27..70bcd49f3 100644
--- a/atom/utils/types.py
+++ b/atom/utils/types.py
@@ -284,8 +284,23 @@ def predict(self, *args, **kwargs) -> Pandas: ...
 
 # Others
 Seasonality: TypeAlias = IntLargerOne | str | Sequence[IntLargerOne | str] | None
+SeasonalityMode: TypeAlias = Literal["additive", "multiplicative"]
 HarmonicsSelector: TypeAlias = Literal["drop", "raw_strength", "harmonic_strength"]
 Stages: TypeAlias = Literal["None", "Staging", "Production", "Archived"]
+PACFMethods: TypeAlias = Literal[
+    "yw",
+    "ywadjusted",
+    "ywm",
+    "ywmle",
+    "ols",
+    "ols-inefficient",
+    "ols-adjusted",
+    "ld",
+    "ldadjusted",
+    "ldb",
+    "ldbiased",
+    "burg",
+]
 NItems: TypeAlias = (
     IntLargerEqualZero
     | dict[str, IntLargerEqualZero]
diff --git a/docs/API/ATOM/atomclassifier/index.html b/docs/API/ATOM/atomclassifier/index.html
index d4056742b..734fe884f 100644
--- a/docs/API/ATOM/atomclassifier/index.html
+++ b/docs/API/ATOM/atomclassifier/index.html
@@ -5177,7 +5177,7 @@ <h2 id="utility-methods">Utility methods</h2>
 <p><tr><td class='td_title'><strong>Parameters</strong></td><td class='td_params'><a id='add-transformer'></a><strong>transformer: Transformer</strong><br><div markdown class='param'>
 Estimator to add to the pipeline. Should implement a
 <code>transform</code> method.</p>
-<p></div><a id='add-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='add-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to
 transform. Only select features or the target column, not
 both at the same time (if that happens, the target column
@@ -5323,7 +5323,7 @@ <h2 id="utility-methods">Utility methods</h2>
 Names of the distributions in <code>scipy.stats</code> to get the
 statistics on. If None, a selection of the most common
 ones is used.</p>
-<p></div><a id='distribution-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='distribution-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to perform
 the test on. If None, select all numerical columns.</p>
 <p></div></td></tr><tr><td class='td_title'><strong>Returns</strong></td><td class='td_params'><a id='distribution-pd.DataFrame'></a><strong>pd.DataFrame</strong><br><div markdown class='param'>
@@ -5602,7 +5602,7 @@ <h2 id="utility-methods">Utility methods</h2>
 <p></div><a id='shrink-dense2sparse'></a><strong>dense2sparse: bool, default=False</strong><br><div markdown class='param'>
 Whether to convert all features to sparse format. The value
 that is compressed is the most frequent value in the column.</p>
-<p></div><a id='shrink-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='shrink-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to shrink. If
 None, transform all columns.
 </div></td></tr></p>
diff --git a/docs/API/ATOM/atomforecaster/index.html b/docs/API/ATOM/atomforecaster/index.html
index 56ffb019c..4d64fb98b 100644
--- a/docs/API/ATOM/atomforecaster/index.html
+++ b/docs/API/ATOM/atomforecaster/index.html
@@ -5133,7 +5133,7 @@ <h2 id="utility-methods">Utility methods</h2>
 <p><tr><td class='td_title'><strong>Parameters</strong></td><td class='td_params'><a id='add-transformer'></a><strong>transformer: Transformer</strong><br><div markdown class='param'>
 Estimator to add to the pipeline. Should implement a
 <code>transform</code> method.</p>
-<p></div><a id='add-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='add-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to
 transform. Only select features or the target column, not
 both at the same time (if that happens, the target column
@@ -5279,7 +5279,7 @@ <h2 id="utility-methods">Utility methods</h2>
 Names of the distributions in <code>scipy.stats</code> to get the
 statistics on. If None, a selection of the most common
 ones is used.</p>
-<p></div><a id='distribution-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='distribution-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to perform
 the test on. If None, select all numerical columns.</p>
 <p></div></td></tr><tr><td class='td_title'><strong>Returns</strong></td><td class='td_params'><a id='distribution-pd.DataFrame'></a><strong>pd.DataFrame</strong><br><div markdown class='param'>
@@ -5558,7 +5558,7 @@ <h2 id="utility-methods">Utility methods</h2>
 <p></div><a id='shrink-dense2sparse'></a><strong>dense2sparse: bool, default=False</strong><br><div markdown class='param'>
 Whether to convert all features to sparse format. The value
 that is compressed is the most frequent value in the column.</p>
-<p></div><a id='shrink-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='shrink-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to shrink. If
 None, transform all columns.
 </div></td></tr></p>
diff --git a/docs/API/ATOM/atomregressor/index.html b/docs/API/ATOM/atomregressor/index.html
index c524785d9..14369d84b 100644
--- a/docs/API/ATOM/atomregressor/index.html
+++ b/docs/API/ATOM/atomregressor/index.html
@@ -5154,7 +5154,7 @@ <h2 id="utility-methods">Utility methods</h2>
 <p><tr><td class='td_title'><strong>Parameters</strong></td><td class='td_params'><a id='add-transformer'></a><strong>transformer: Transformer</strong><br><div markdown class='param'>
 Estimator to add to the pipeline. Should implement a
 <code>transform</code> method.</p>
-<p></div><a id='add-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='add-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to
 transform. Only select features or the target column, not
 both at the same time (if that happens, the target column
@@ -5300,7 +5300,7 @@ <h2 id="utility-methods">Utility methods</h2>
 Names of the distributions in <code>scipy.stats</code> to get the
 statistics on. If None, a selection of the most common
 ones is used.</p>
-<p></div><a id='distribution-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='distribution-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to perform
 the test on. If None, select all numerical columns.</p>
 <p></div></td></tr><tr><td class='td_title'><strong>Returns</strong></td><td class='td_params'><a id='distribution-pd.DataFrame'></a><strong>pd.DataFrame</strong><br><div markdown class='param'>
@@ -5579,7 +5579,7 @@ <h2 id="utility-methods">Utility methods</h2>
 <p></div><a id='shrink-dense2sparse'></a><strong>dense2sparse: bool, default=False</strong><br><div markdown class='param'>
 Whether to convert all features to sparse format. The value
 that is compressed is the most frequent value in the column.</p>
-<p></div><a id='shrink-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='shrink-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 <a class="autorefs autorefs-internal" href="../../../user_guide/data_management/#row-and-column-selection">Selection of columns</a> to shrink. If
 None, transform all columns.
 </div></td></tr></p>
diff --git a/docs/API/plots/plot_parshap/index.html b/docs/API/plots/plot_parshap/index.html
index 726e50470..879472e61 100644
--- a/docs/API/plots/plot_parshap/index.html
+++ b/docs/API/plots/plot_parshap/index.html
@@ -4543,7 +4543,7 @@ <h1 id="plot_parshap">plot_parshap</h1>
 <table class="table_params">
 <p><tr><td class='td_title'><strong>Parameters</strong></td><td class='td_params'><a id='plot_parshap-models'></a><strong>models: int, str, Model, segment, sequence or None, default=None</strong><br><div markdown class='param'>
 Models to plot. If None, all models are selected.</p>
-<p></div><a id='plot_parshap-columns'></a><strong>columns: int, str, segment, sequence or None, default=None</strong><br><div markdown class='param'>
+<p></div><a id='plot_parshap-columns'></a><strong>columns: int, str, segment, sequence, dataframe or None, default=None</strong><br><div markdown class='param'>
 XSelector to plot. If None, it plots all features.</p>
 <p></div><a id='plot_parshap-target'></a><strong>target: int, str or tuple, default=1</strong><br><div markdown class='param'>
 Class in the target column to target. For multioutput tasks,
diff --git a/docs/search/search_index.json b/docs/search/search_index.json
index 42381e4ba..4fd8e276d 100644
--- a/docs/search/search_index.json
+++ b/docs/search/search_index.json
@@ -1 +1 @@
-{"config": {"lang": ["en"], "separator": "[\\s\\-]+", "pipeline": ["stopWordFilter"]}, "docs": [{"location": "about/", "title": "About", "text": ""}, {"location": "about/#what-is-it", "title": "What is it?", "text": "<p>Automated Tool for Optimized Modeling (ATOM) is an open-source Python package designed to help data scientists fasten up the exploration phase of their machine learning projects. ATOM is a low-code, easy-to-use library, capable of running experiments quickly and efficiently, enabling the user to go from raw data to generating insights in just a few lines of code. Click here to get started.</p> <p></p>"}, {"location": "about/#what-can-i-do-with-it", "title": "What can I do with it?", "text": "<p>ATOM is an end-to-end solution for machine learning pipelines. It supports the user from raw data ingestion to the final results' analysis and model deployment. Click on the icons to read more about its main functionalities.</p> Data cleaning Feature engineering Model selection Hyperparametertuning Model training Model predictions Experiment logging Analysis &amp;Interpretability"}, {"location": "about/#who-is-it-intended-for", "title": "Who is it intended for?", "text": "<ul> <li>Data scientists that want to fasten up the exploration phase of their machine   learning projects.</li> <li>Data scientists that want to run a simple modeling experiment without having   to spend too much time on coding.</li> <li>Data scientists that are new to Python and are not (yet) familiar with all   the relevant machine learning packages.</li> <li>Data analysts without extensive knowledge of machine learning that want to   try out model-based solutions.</li> <li>Anyone who wants to rapidly build a Proof of Concept, for example during a hackathon.</li> <li>Anyone who is new to the field of machine learning and wants a low-code,   easy to learn package, to get started building predictive pipelines.</li> </ul>"}, {"location": "about/#citing-atom", "title": "Citing ATOM", "text": 
"<p>If you use ATOM in a scientific publication, please consider citing this documentation page as the resource. ATOM\u2019s first stable release v2.0.3 was made publicly available in November 2019. A formatted version of the citation would look like this:</p> <p>ATOM v2.0.3, November 2019. URL https://tvdboom.github.io/ATOM/</p> <p>BibTeX entry:</p> <pre><code>@Manual{ATOM,\n    title = {ATOM: A Python package for fast exploration of machine learning pipelines},\n    author = {Mavs},\n    year={2019},\n    mont={November},\n    note = {ATOM version 2.0.3},\n    url = {https://tvdboom.github.io/ATOM/},\n}\n</code></pre> <p></p>"}, {"location": "about/#support", "title": "Support", "text": "<p>ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.</p> <p> </p> <p></p>"}, {"location": "about/#integrations", "title": "Integrations", "text": ""}, {"location": "contributing/", "title": "Contributing", "text": "<p>Are you interested in contributing to ATOM? Do you want to report a bug? Do you have a question? Before you do, please read the following guidelines.</p> <p></p>"}, {"location": "contributing/#submission-context", "title": "Submission context", "text": ""}, {"location": "contributing/#question-or-problem", "title": "Question or problem?", "text": "<p>For quick questions, there's no need to open an issue. Check first if the question isn't already answered in the FAQ section. If not, reach us through the discussions page or on the slack channel.</p>"}, {"location": "contributing/#report-a-bug", "title": "Report a bug?", "text": "<p>If you found a bug in the source code, you can help by submitting an issue to the issue tracker in the GitHub repository. Even better, you can submit a Pull Request with a fix. 
However, before doing so, please read the submission guidelines.</p>"}, {"location": "contributing/#missing-a-feature", "title": "Missing a feature?", "text": "<p>You can request a new feature by submitting an issue to the GitHub Repository. If you would like to implement a new feature, please submit an issue with a proposal for your work first. Please consider what kind of change it is:</p> <ul> <li> <p>For a major feature, first open an issue and outline your proposal so   that it can be discussed. This will also allow us to better coordinate our   efforts, prevent duplication of work, and help you to craft the change so   that it is successfully accepted into the project.</p> </li> <li> <p>Small features and bugs can be crafted and directly submitted as a Pull   Request. However, there is no guarantee that your feature will make it into   <code>master</code>, as it's always a matter of opinion whether if benefits the   overall functionality of the project.</p> </li> </ul>"}, {"location": "contributing/#project-layout", "title": "Project layout", "text": "<p>The latest stable release of ATOM is on the <code>master</code> branch, whereas the latest version of ATOM in development is on the <code>development</code> branch. Make sure you are looking at and working on the correct branch if you're looking to contribute code.</p> <p>In terms of directory structure:</p> <ul> <li>All of ATOM's code sources are in the <code>atom</code> directory.</li> <li>The documentation sources are in the <code>docs_sources</code> directory.</li> <li>Images in the documentation are in the <code>docs_sources/img</code> directory.</li> <li>Tutorial notebooks are in the <code>examples</code> directory. If you want to   include the example to the documentation as well, add the <code>.ipynb</code> file   to <code>docs_sources/examples</code> and update the <code>mkdocs.yml</code> file accordingly.</li> <li>Unit tests are in the <code>tests</code> directory. 
Make sure to add the tests to the   file corresponding to the module in the <code>atom</code> directory with the code that   is being tested.</li> </ul> <p>Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the <code>development</code> branch.</p> <p></p>"}, {"location": "contributing/#submission-guidelines", "title": "Submission guidelines", "text": ""}, {"location": "contributing/#submitting-an-issue", "title": "Submitting an issue", "text": "<p>Before you submit an issue, please search the issue tracker, maybe an issue for your problem already exists, and the discussion might inform you of workarounds readily available.</p> <p>We want to fix all the issues as soon as possible, but before fixing a bug we need to reproduce and confirm it. In order to reproduce bugs we will systematically ask you to provide a minimal reproduction scenario using the custom issue template.</p>"}, {"location": "contributing/#submitting-a-pull-request", "title": "Submitting a pull request", "text": "<p>Before you submit a pull request, please work through this checklist to make sure that you have done the necessary so we can efficiently review and accept your changes.</p> <ul> <li>Update the documentation so all of your changes are reflected there.</li> <li>Adhere to PEP 8 standards.</li> <li>Use a maximum of 91 characters per line. 
Try to keep docstrings below   74 characters.</li> <li>Update the project unit tests to test your code changes as thoroughly   as possible.</li> <li>Make sure that your code is properly commented with docstrings and   comments explaining your rationale behind non-obvious coding practices.</li> <li>Run isort: <code>isort atom tests</code>.</li> <li>Run flake8: <code>flake8 --show-source --statistics atom tests</code>.</li> <li>Run pydocstyle: <code>pydocstyle atom tests</code>.</li> <li>Run mypy: <code>mypy atom tests</code>.</li> </ul> <p>If your contribution requires a new library dependency:</p> <ul> <li>Double-check that the new dependency is easy to install via pip and Anaconda.</li> <li>The library should support Python 3.10 and 3.11.</li> <li>Make sure the code works with the latest version of the library.</li> <li>Update the dependencies in the documentation.</li> <li>Add the library with the minimum required version to <code>pyproject.toml</code>.</li> </ul> <p>After submitting your pull request, GitHub will automatically run the tests on your changes and make sure that the updated code builds successfully. The checks run on Python 3.10 and 3.11, on Ubuntu and Windows. We also use services that automatically check code style and test coverage.</p>"}, {"location": "dependencies/", "title": "Dependencies", "text": ""}, {"location": "dependencies/#python-os", "title": "Python &amp; OS", "text": "<p>As of the moment, ATOM supports the following Python versions:</p> <ul> <li>Python 3.10</li> <li>Python 3.11</li> </ul> <p>And operating systems:</p> <ul> <li>Linux (Ubuntu, Fedora, etc...)</li> <li>Windows 8.1+</li> <li>macOS (not tested)</li> </ul> <p></p>"}, {"location": "dependencies/#packages", "title": "Packages", "text": ""}, {"location": "dependencies/#required", "title": "Required", "text": "<p>ATOM is built on top of several existing Python libraries. 
These packages are necessary for its correct functioning.</p> <ul> <li>beartype (&gt;=0.16.4)</li> <li>category-encoders (&gt;=2.6.3)</li> <li>dagshub (&gt;=0.3.8)</li> <li>dill (&gt;=0.3.6)</li> <li>gplearn (&gt;=0.4.2)</li> <li>imbalanced-learn (&gt;=0.11.0)</li> <li>ipython (&gt;=8.11.0)</li> <li>ipywidgets (&gt;=8.1.1)</li> <li>featuretools (&gt;=1.28.0)</li> <li>joblib (&gt;=1.3.1)</li> <li>matplotlib (&gt;=3.7.2)</li> <li>mlflow (&gt;=2.7.1)</li> <li>modin[ray] (&gt;=0.25.0)</li> <li>nltk (&gt;=3.8.1)</li> <li>numpy (&gt;=1.23.0)</li> <li>optuna (&gt;=3.4.0)</li> <li>pandas[parquet] (&gt;=2.1.2)</li> <li>plotly (&gt;=5.15.0)</li> <li>ray[serve] (&gt;=2.7.1)</li> <li>scikit-learn (&gt;=1.3.1)</li> <li>scikit-learn-intelex (&gt;=2023.2.1)</li> <li>scipy (&gt;=1.10.1)</li> <li>shap (&gt;=0.43.0)</li> <li>sktime (&gt;=0.24.0)</li> <li>zoofs (&gt;=0.1.26)</li> </ul>"}, {"location": "dependencies/#optional", "title": "Optional", "text": "<p>Some specific models, utility methods or plots require the installation of additional libraries. You can install all the optional dependencies using <code>pip install atom-ml[full]</code>. Doing so also installs the following libraries:</p> <ul> <li>botorch (&gt;=0.8.5)</li> <li>catboost (&gt;=1.2)</li> <li>explainerdashboard (&gt;=0.4.3)</li> <li>gradio (&gt;=3.44.4)</li> <li>lightgbm (&gt;=4.1.0)</li> <li>pmdarima (&gt;=2.0.3)</li> <li>schemdraw (&gt;=0.16)</li> <li>sweetviz (&gt;=2.3.1)</li> <li>wordcloud (&gt;=1.9.2)</li> <li>xgboost (&gt;=2.0.0)</li> </ul>"}, {"location": "dependencies/#development", "title": "Development", "text": "<p>The development dependencies are not installed with the package, and are not required for any of its functionalities. These libraries are only necessary to contribute to the project. 
Install them running <code>pdm install --dev</code> (don't forget to install pdm with <code>pip install -U pdm</code>).</p> <p>Linting</p> <ul> <li>isort (&gt;=5.12.0)</li> <li>flake8 (&gt;=6.0.0)</li> <li>flake8-pyproject (&gt;=1.2.3)</li> <li>pydocstyle (&gt;=6.3.0)</li> <li>mypy (&gt;=1.6.1)</li> <li>pandas_stubs (&gt;=2.1.1.230928)</li> <li>types-requests (&gt;=2.31.0.10)</li> </ul> <p>Testing</p> <ul> <li>nbmake (&gt;=1.4.1)</li> <li>pytest (&gt;=7.2.1)</li> <li>pytest-cov (&gt;=4.0.0)</li> <li>pytest-xdist (&gt;=3.2.0)</li> <li>scikeras (&gt;=0.11.0)</li> <li>tensorflow (&gt;=2.13.0)</li> </ul> <p>Documentation</p> <ul> <li>jupyter-contrib-nbextensions (&gt;=0.7.0)</li> <li>mike (&gt;=1.1.2)</li> <li>mkdocs (&gt;=1.5.3)</li> <li>mkdocs-autorefs (&gt;=0.5.0)</li> <li>mkdocs-jupyter (&gt;=0.24.6)</li> <li>mkdocs-material (&gt;=9.4.7)</li> <li>mkdocs-simple-hooks (&gt;=0.1.5)</li> <li>pymdown-extensions (&gt;=10.3.1)</li> <li>pyyaml (&gt;=6.0)</li> </ul>"}, {"location": "faq/", "title": "Frequently asked questions", "text": "<p>Here we try to give answers to some questions that have popped up regularly. If you have any other questions, don't hesitate to create a new discussion or post them on the Slack channel! </p> <p>??? faq Is this package related to the Atom text editor?\"     There is, indeed, a text editor with the same name and a similar logo as     this package. Is this a shameless copy? No. When I started the project,     I didn't know about the text editor, and it doesn't require much thinking     to come up with the idea of replacing the letter O of the word atom with     the image of an atom.</p> How does ATOM relate to AutoML? <p>ATOM is not an AutoML tool since it does not automate the search for an optimal pipeline like well-known AutoML tools such as auto-sklearn or EvalML do. Instead, ATOM helps the user find the optimal pipeline himself. 
One of the goals of this package is to help data scientists produce explainable pipelines, and using an AutoML black box function would impede that.</p> Is it possible to run deep learning models? <p>Yes. Deep learning models can be added as custom models to the pipeline as long as they follow sklearn's API. For more information, see the deep learning section of the user guide.</p> Can I run atom's methods on just a subset of the columns? <p>Yes, all data cleaning and feature engineering methods accept a <code>columns</code> parameter to only transform the selected features. For example, to only impute the numerical columns in the dataset we could type <code>atom.impute(strat_num=\"mean\", columns=atom.numerical)</code>. The parameter accepts column names, column indices, dtypes or a slice object.</p> How can I compare the same model on different datasets? <p>In many occasions you might want to test how a model performs on datasets processed with different pipelines. For this, atom has the branch system. Create a new branch for every new pipeline you want to test and use the plot methods to compare all models, independent of the branch it was trained on.</p> Can I train models through atom using a GPU? <p>Yes. Refer to the user guide to see what algorithms and models have a GPU implementation. Be aware that it could require additional software and hardware dependencies.</p> How are numerical and categorical columns differentiated? <p>The columns are separated using a dataframe's select_dtypes method. Numerical columns are selected using <code>include=\"number\"</code> whereas categorical columns are selected using <code>exclude=\"number\"</code>.</p> Can I run unsupervised learning pipelines? <p>No. As for now, ATOM only supports supervised machine learning pipelines. However, various unsupervised algorithms can be chosen as strategy in the Pruner class to detect and remove outliers from the dataset.</p> Is there a way to plot multiple models in the same shap plot? 
<p>No. Unfortunately, there is no way to plot multiple models in the same shap plot since the plots are made by the shap package and passed as <code>matplotlib.axes</code> objects to atom. This means that it's not within the reach of this package to implement such a utility.</p> Can I merge a sklearn pipeline with atom? <p>Yes. Like any other transformer, it is possible to add a sklearn pipeline to atom using the add method. Every transformer in the pipeline is merged independently. The pipeline is not allowed to end with a model since atom manages its own models. If that is the case, add the pipeline using <code>atom.add(pipeline[:-1])</code>.</p> Is it possible to initialize atom with an existing train and test set? <p>Yes. If you already have a separated train and test set you can initialize atom in two ways:</p> <ul> <li><code>atom = ATOMClassifier(train, test)</code></li> <li><code>atom = ATOMClassifier((X_train, y_train), (X_test, y_test))</code></li> </ul> <p>Make sure the train and test size have the same number of columns! If atom is initialized in any of these two ways, the <code>test_size</code> parameter is ignored.</p> Can I train the models using cross-validation? <p>Applying cross-validation means transforming every step of the pipeline multiple times, each with different results. Doing this would prevent ATOM from being able to show the transformation results after every pre-processing step, which means losing the ability to inspect how a transformer changed the dataset. For this reason, it is not possible to apply cross-validation until after a model has been trained. After a model has been trained, the pipeline is defined, and cross-validation can be applied using the cross_validate method. See here an example using cross-validation.</p> Is there a way to process datetime features? <p>Yes, the FeatureExtractor class can automatically extract useful features (day, month, year, etc...) from datetime columns. 
The extracted features are always encoded to numerical values, so they can be fed directly to a model.</p>"}, {"location": "getting_started/", "title": "Getting started", "text": ""}, {"location": "getting_started/#installation", "title": "Installation", "text": "<p>Install ATOM's newest release easily via <code>pip</code>:</p> <pre><code>pip install -U atom-ml\n</code></pre> <p>or via <code>conda</code>:</p> <pre><code>conda install -c conda-forge atom-ml\n</code></pre> <p>Note</p> <p>Since atom was already taken, download the package under the name <code>atom-ml</code>!</p> <p>Warning</p> <p>ATOM makes use of many other ML libraries, making its dependency list quite long. Because of that, the installation may take longer than you are accustomed to. Be patient!</p> <p></p> <p>Optional dependencies</p> <p>Some specific models, utility methods or plots require the installation of additional libraries. To install the optional dependencies, add <code>[full]</code> after the package's name.</p> <pre><code>pip install -U atom-ml[full]\n</code></pre> <p></p> <p>Latest source</p> <p>Sometimes, new features and bug fixes are already implemented in the <code>development</code> branch, but waiting for the next release to be made available. If you can't wait for that, it's possible to install the package directly from git.</p> <pre><code>pip install git+https://github.com/tvdboom/ATOM.git@development#egg=atom-ml\n</code></pre> <p>Don't forget to include <code>#egg=atom-ml</code> to explicitly name the project, this way pip can track metadata for it without having to have run the <code>setup.py</code> script.</p> <p></p> <p>Contributing</p> <p>If you are planning to contribute to the project, you'll need the development dependencies. 
Install them adding <code>[dev]</code> after the package's name.</p> <pre><code>pip install -U atom-ml[dev]\n</code></pre> <p>Click here for a complete list of package files for all versions published on PyPI.</p> <p></p>"}, {"location": "getting_started/#usage", "title": "Usage", "text": "<p>ATOM contains a variety of classes and functions to perform data cleaning, feature engineering, model training, plotting and much more. The easiest way to use everything ATOM has to offer is through one of the main classes:</p> <ul> <li>ATOMClassifier for classification tasks.</li> <li>ATOMForecaster for forecasting tasks.</li> <li>ATOMRegressor for regression tasks.</li> </ul> <p>Let's walk you through an example. Click on the SageMaker Studio Lab badge on top of this section to run this example yourself.</p> <p>Make the necessary imports and load the data.</p> <pre><code>&gt;&gt;&gt; import pandas as pd\n&gt;&gt;&gt; from atom import ATOMClassifier\n\n&gt;&gt;&gt; # Load the Australian Weather dataset\n&gt;&gt;&gt; X = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)\n&gt;&gt;&gt; print(X.head())\n\n           Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm RainToday  RainTomorrow\n0  MelbourneAirport     18.0     26.9      21.4          7.0       8.9         SSE           41.0          W        SSE           9.0          20.0         95.0         54.0       1019.5       1017.0       8.0       5.0     18.5     26.0       Yes             0\n1          Adelaide     17.2     23.4       0.0          NaN       NaN           S           41.0          S        WSW          13.0          19.0         59.0         36.0       1015.7       1015.7       NaN       NaN     17.7     21.9        No             0\n2            Cairns     18.6     24.6       7.4          3.0       6.1         SSE        
   54.0        SSE         SE          26.0          35.0         78.0         57.0       1018.7       1016.6       3.0       3.0     20.8     24.1       Yes             0\n3          Portland     13.6     16.8       4.2          1.2       0.0         ESE           39.0        ESE        ESE          17.0          15.0         76.0         74.0       1021.4       1020.5       7.0       8.0     15.6     16.0       Yes             1\n4           Walpole     16.4     19.9       0.0          NaN       NaN          SE           44.0         SE         SE          19.0          30.0         78.0         70.0       1019.4       1018.9       NaN       NaN     17.4     18.1        No             0\n</code></pre> <p>Initialize the ATOMClassifier or ATOMRegressor class. These two classes are convenient wrappers for the whole machine learning pipeline. Contrary to sklearn's API, they are initialized providing the data you want to manipulate.</p> <pre><code>&gt;&gt;&gt; atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (100, 22)\nTrain set size: 80\nTest set size: 20\n-------------------------------------\nMemory: 17.73 kB\nScaled: False\nMissing values: 193 (8.8%)\nCategorical features: 5 (23.8%)\n</code></pre> <p>Data transformations are applied through atom's methods. For example, calling the impute method will initialize an Imputer instance, fit it on the training set and transform the whole dataset. 
The transformations are applied immediately after calling the method (no fit and transform commands necessary).</p> <pre><code>&gt;&gt;&gt; atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \n\nFitting Imputer...\nImputing missing values...\n --&gt; Imputing 1 missing values with median (0.0) in feature Rainfall.\n --&gt; Imputing 36 missing values with median (4.8) in feature Evaporation.\n --&gt; Imputing 38 missing values with median (8.45) in feature Sunshine.\n --&gt; Imputing 8 missing values with most_frequent (SSE) in feature WindGustDir.\n --&gt; Imputing 8 missing values with median (41.0) in feature WindGustSpeed.\n --&gt; Imputing 7 missing values with most_frequent (ESE) in feature WindDir9am.\n --&gt; Imputing 2 missing values with median (13.0) in feature WindSpeed9am.\n --&gt; Imputing 1 missing values with median (74.0) in feature Humidity9am.\n --&gt; Imputing 6 missing values with median (1017.55) in feature Pressure9am.\n --&gt; Imputing 6 missing values with median (1015.4) in feature Pressure3pm.\n --&gt; Imputing 38 missing values with median (5.5) in feature Cloud9am.\n --&gt; Imputing 40 missing values with median (5.0) in feature Cloud3pm.\n --&gt; Imputing 1 missing values with median (17.2) in feature Temp9am.\n --&gt; Imputing 1 missing values with most_frequent (No) in feature RainToday.\n\n&gt;&gt;&gt; atom.encode(strategy=\"Target\", max_onehot=8)\n\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 42 classes.\n   --&gt; Handling 2 unknown classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n   --&gt; Handling 1 unknown classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</code></pre> <p>Similarly, models are trained and evaluated using the run method. 
Here, we fit both a LogisticRegression and LinearDiscriminantAnalysis model, and apply hyperparameter tuning.</p> <pre><code>&gt;&gt;&gt; atom.run(models=[\"LR\", \"LDA\"], metric=\"auc\", n_trials=6)\n\n\nTraining ========================= &gt;&gt;\nModels: LR, LDA\nMetric: auc\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |      l2 |  1.1302 |     sag |      730 |      0.3 |  0.5417 |   0.5417 |     0.093s |  0.093s | COMPLETE |\n| 1     |    None |  0.1544 |   lbfgs |      120 |      0.5 |  0.8542 |   0.8542 |     0.092s |  0.185s | COMPLETE |\n| 2     |      l2 |  0.0027 |     sag |      460 |      0.4 |  0.5625 |   0.8542 |     0.090s |  0.275s | COMPLETE |\n| 3     |      l2 |  0.0062 |   lbfgs |      800 |      0.8 |  0.6042 |   0.8542 |     0.090s |  0.365s | COMPLETE |\n| 4     | elast.. |  4.2724 |    saga |      530 |      0.1 |  0.6042 |   0.8542 |     0.096s |  0.461s | COMPLETE |\n| 5     |      l2 |  1.3274 | newto.. 
|      680 |      0.3 |  0.5625 |   0.8542 |     0.093s |  0.555s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 1\nBest parameters:\n --&gt; penalty: None\n --&gt; C: 0.1544\n --&gt; solver: lbfgs\n --&gt; max_iter: 120\n --&gt; l1_ratio: 0.5\nBest evaluation --&gt; auc: 0.8542\nTime elapsed: 0.555s\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 1.0\nTest evaluation --&gt; auc: 0.4133\nTime elapsed: 0.074s\n-------------------------------------------------\nTime: 0.629s\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |     svd |      None |  0.6458 |   0.6458 |     0.086s |  0.086s | COMPLETE |\n| 1     |    lsqr |       0.7 |  0.9375 |   0.9375 |     0.081s |  0.167s | COMPLETE |\n| 2     |     svd |       nan |  0.6458 |   0.9375 |     0.001s |  0.168s | COMPLETE |\n| 3     |    lsqr |       0.8 |   0.625 |   0.9375 |     0.079s |  0.247s | COMPLETE |\n| 4     |     svd |       nan |  0.6458 |   0.9375 |     0.000s |  0.247s | COMPLETE |\n| 5     |   eigen |       0.8 |    0.75 |   0.9375 |     0.078s |  0.326s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 1\nBest parameters:\n --&gt; solver: lsqr\n --&gt; shrinkage: 0.7\nBest evaluation --&gt; auc: 0.9375\nTime elapsed: 0.326s\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.8576\nTest evaluation --&gt; auc: 0.8933\nTime elapsed: 0.016s\n-------------------------------------------------\nTime: 0.342s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.005s\n-------------------------------------\nLogisticRegression         --&gt; auc: 0.4133 ~\nLinearDiscriminantAnalysis --&gt; auc: 0.8933 !\n</code></pre> <p>And lastly, analyze the results.</p> 
<pre><code>&gt;&gt;&gt; print(atom.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR       0.60  0.2793  0.4000  0.0000      0.0 -0.2425       0.00     0.0  0.4667\nLDA      0.85  0.7944  0.7667  0.6667      0.5  0.5774       0.75     0.6  0.9067\n\n\n&gt;&gt;&gt; atom.plot_lift()\n</code></pre>"}, {"location": "license/", "title": "MIT License", "text": "<p>Copyright \u00a9 2023 Mavs</p> <p>Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:</p> <p>The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.</p> <p>THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.</p>"}, {"location": "API/ATOM/atomclassifier/", "title": "ATOMClassifier", "text": "<p>class atom.api.ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for classification tasks.</p> <p>Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.</p> <p>All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.</p> <p>Parameters*arrays: sequence of indexables Dataset containing features and target. 
Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to `X`. <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>This parameter is ignored if the target column is provided through <code>arrays</code>.</p> <p>index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe. 
<ul> <li>If False: Reset to RangeIndex.</li> <li>If True: Use the provided index.</li> <li>If int: Position of the column to use as index.</li> <li>If str: Name of the column to use as index.</li> <li>If sequence: Array with shape=(n_samples,) to use as index.</li> </ul> <p>test_size: int or float, default=0.2 <ul> <li>If &lt;=1: Fraction of the dataset to include in the test set.</li> <li>If &gt;1: Number of rows to include in the test set.</li> </ul> <p>This parameter is ignored if the test set is provided through <code>arrays</code>.</p> <p>holdout_size: int, float or None, default=None <ul> <li>If None: No holdout data set is kept apart.</li> <li>If &lt;=1: Fraction of the dataset to include in the holdout set.</li> <li>If &gt;1: Number of rows to include in the holdout set.</li> </ul> <p>This parameter is ignored if the holdout set is provided through <code>arrays</code>.</p> <p>shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets. <p>stratify: bool, int, str or sequence, default=True Handle stratification of the target classes over the data sets. <ul> <li>If False: The data is split randomly.</li> <li>If True: The data is stratified over the target column.</li> <li>Else: Name or position of the columns to stratify by. The   columns can't contain <code>NaN</code> values.</li> </ul> <p>This parameter is ignored if <code>shuffle=False</code> or if the test set is provided through <code>arrays</code>.</p> <p>For multioutput tasks, stratification is applied to the joint target columns.</p> <p>n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows. <ul> <li>If &lt;=1: Fraction of the dataset to select.</li> <li>If &gt;1: Exact number of rows to select. 
Only if <code>arrays</code> is X          or X, y.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores + 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. 
Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of Python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p>ATOMRegressor Main class for regression tasks.</p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 180 (1.3%)\n\n\n\n&gt;&gt;&gt; # Apply data cleaning and feature engineering methods\n&gt;&gt;&gt; atom.balance(strategy=\"smote\")\n\nOversampling with SMOTE...\n --&gt; Adding 116 samples to class 0.\n\n&gt;&gt;&gt; atom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfe selected 22 features from the dataset.\n   --&gt; Dropping feature mean area (rank 7).\n   --&gt; Dropping feature mean compactness (rank 2).\n   --&gt; Dropping feature mean fractal dimension (rank 6).\n   --&gt; Dropping feature smoothness error (rank 9).\n   --&gt; Dropping feature concave points error (rank 4).\n   --&gt; Dropping feature fractal dimension error (rank 8).\n   --&gt; Dropping feature worst radius (rank 3).\n   --&gt; Dropping feature worst area (rank 5).\n\n\n&gt;&gt;&gt; # Train models\n&gt;&gt;&gt; atom.run(models=[\"LR\", \"RF\", \"XGB\"])\n\n\nTraining ========================= &gt;&gt;\nModels: LR, RF, XGB\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 
0.9878\nTest evaluation --&gt; f1: 0.9859\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9714\nTime elapsed: 0.251s\n-------------------------------------------------\nTime: 0.251s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9718\nTime elapsed: 0.412s\n-------------------------------------------------\nTime: 0.412s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.759s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9859 !\nRandomForest       --&gt; f1: 0.9714\nXGBoost            --&gt; f1: 0.9718\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(atom.results)\n\n     f1_train  f1_test  time_fit      time\nLR     0.9878   0.9859  0.086078  0.086078\nRF     1.0000   0.9714  0.251238  0.251238\nXGB    1.0000   0.9718  0.412373  0.412373\n\n\n&gt;&gt;&gt; print(atom.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR     0.9823  0.9975  0.9811  0.9859   0.9722  0.9621     0.9859  0.9859  0.9960\nRF     0.9646  0.9704  0.9670  0.9714   0.9444  0.9256     0.9855  0.9577  0.9670\nXGB    0.9646  0.9622  0.9621  0.9718   0.9452  0.9242     0.9718  0.9718  0.9621\n</code></pre>"}, {"location": "API/ATOM/atomclassifier/#magic-methods", "title": "Magic methods", "text": "<p>The class contains some magic methods to help you access some of its elements faster. 
Note that methods that apply on the pipeline can return different results per branch.</p> <ul> <li>__repr__: Prints an overview of atom's branches, models and metric.</li> <li>__len__: Returns the length of the dataset.</li> <li>__iter__: Iterate over the pipeline's transformers.</li> <li>__contains__: Checks if the provided item is a column in the dataset.</li> <li>__getitem__: Access a branch, model, column or subset of the dataset.</li> </ul> <p></p>"}, {"location": "API/ATOM/atomclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomclassifier/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributespipeline: PipelinePipeline of transforms. <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. 
X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled. <p>A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\". <p>These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them. <p>This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values. <p>This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values. <p>This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers. <p>This property is unavailable for sparse datasets. 
classes: DataFrameDistribution of target classes per data set. <p>This property is only available for classification tasks. n_classes: int | numpy.integer | Series | modin.pandas.series.SeriesNumber of classes in the target column(s). <p>This property is only available for classification tasks. </p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesbranch: BranchCurrent active branch. <p>Use the property's <code>@setter</code> to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use <code>_from_</code> to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/ATOM/atomclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#utility-methods", "title": "Utility methods", "text": "<p>Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.</p> <p>addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a <code>.csv</code> file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.</p> <p>If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. 
If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.</p> <p>Warning</p> <ul> <li>The transformer should have fit and/or transform methods   with arguments <code>X</code> (accepting a dataframe-like object of   shape=(n_samples, n_features)) and/or <code>y</code> (accepting a   sequence of shape=(n_samples,)).</li> <li>The transform method should return a feature set as a   dataframe-like object of shape=(n_samples, n_features)   and/or a target column as a sequence of shape=(n_samples,).</li> </ul> <p>Note</p> <p>If the transform method doesn't return a dataframe:</p> <ul> <li>The column naming happens as follows. If the transformer   has a <code>get_feature_names_out</code> method, it is used. If not,   and it returns the same number of columns, the names are   kept equal. If the number of columns changes, old columns   will keep their name (as long as the column is unchanged)   and new columns will receive the name <code>x[N-1]</code>, where N   stands for the n-th feature. This means that a transformer   should only transform, add or drop columns, not   combinations of these.</li> <li>The index remains the same as before the transformation.   This means that the transformer should not add, remove or   shuffle rows unless it returns a dataframe.</li> </ul> <p>Note</p> <p>If the transformer has a <code>n_jobs</code> and/or <code>random_state</code> parameter that is left to its default value, it adopts atom's value.</p> <p>Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a <code>transform</code> method. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. <p>train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. 
Note that if True, the transformation is skipped when making predictions on new data. <p>**fit_params Additional keyword arguments for the transformer's fit method. </p> <p></p> <p>method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.</p> <p>This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...</p> <p>Note</p> <p>This approach is preferred over changing the dataset directly through the property's <code>@setter</code> since the transformation is stored in the pipeline.</p> <p>Tip</p> <p>Use <code>atom.apply(lambda df: df.drop(\"column_name\",axis=1))</code> to store the removal of columns in the pipeline.</p> <p>Parametersfunc: callable Function to apply with signature <code>func(dataset, **kw_args) -&gt; dataset</code>. <p>inverse_func: callable or None, default=None Inverse function of <code>func</code>. If None, the inverse_transform method returns the input unchanged. <p>kw_args: dict or None, default=None Additional keyword arguments for the function. <p>inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function. </p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.</p> <p>Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.</p> <p>Tip</p> <p>Use the plot_distribution method to plot a column's distribution.</p> <p>Parametersdistributions: str, sequence or None, default=None Names of the distributions in <code>scipy.stats</code> to get the statistics on. If None, a selection of the most common ones is used. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to perform the test on. 
If None, select all numerical columns. <p>Returnspd.DataFrame Statistic results with multiindex levels: <ul> <li>dist: Name of the distribution.</li> <li>stat: Statistic results:<ul> <li>score: KS-test score.</li> <li>p_value: Corresponding p-value. </li> </ul> </li> </ul> <p></p> <p>method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.</p> <p>ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the <code>report</code> attribute. It can either report one dataset or compare two datasets against each other.</p> <p>Warning</p> <p>This method can be slow for large datasets.</p> <p>Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows to include in the report. <ul> <li>If str: Name of the data set to report.</li> <li>If sequence: Names of two data sets to compare.</li> <li>If dict: Names of up to two data sets with corresponding   selection of rows to report.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.</p> <p>The returned weights are inversely proportional to the class frequencies in the selected data set. 
For multioutput tasks, the weights of each column of <code>y</code> will be multiplied.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsseries Sequence of weights with shape=(n_samples,). </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement an <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. 
</p> <p></p> <p>function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.</p> <p>If the instance was saved using <code>save_data=False</code>, it's possible to load new data into it and apply all data transformations.</p> <p>Info</p> <p>The loaded instance's current branch is the same branch as it was when saved.</p> <p>Parametersfilename: str or Path Filename or pathlib.Path of the pickle file. <p>data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using <code>save_data=False</code>. Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsatom Unpickled atom instance. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. 
It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset(hard=False)[source]Reset the instance to its initial state.</p> <p>Deletes all branches and models. The dataset is also reset to its form after initialization.</p> <p>Parametershard: bool, default=False If True, completely flushes the cache. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a <code>.csv</code> file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. 
Use \"auto\" for automatic naming. <p>rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save. <p>**kwargs Additional keyword arguments for pandas' to_csv method. </p> <p></p> <p>method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.</p> <p>Examples are: float64 -&gt; float32, int64 -&gt; int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.</p> <p>Parametersint2bool: bool, default=False Whether to convert <code>int</code> columns to <code>bool</code> type. Only if the values in the column are strictly in (0, 1) or (-1, 1). <p>int2uint: bool, default=False Whether to convert <code>int</code> to <code>uint</code> (unsigned integer). Only if the values in the column are strictly positive. <p>str2cat: bool, default=False Whether to convert <code>string</code> to <code>category</code>. Only if the number of categories is less than 30% of the column's length. <p>dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to shrink. If None, transform all columns. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. 
<p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method stats()[source]Display basic information about the dataset.</p> <p></p> <p>method status()[source]Get an overview of the branches and models.</p> <p>This method prints the same information as the __repr__ and also saves it to the logger.</p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be used, for example, to transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#data-cleaning", "title": "Data cleaning", "text": "<p>The data cleaning methods can help you scale the data, handle missing values, categorical columns, outliers and unbalanced datasets. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.</p> <p>Tip</p> <p>Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.</p> <p>balanceBalance the number of rows per class in the target column.cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.</p> <p></p> <p>method balance(strategy=\"adasyn\", **kwargs)[source]Balance the number of rows per class in the target column.</p> <p>When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set.</p> <p>See the Balancer class for a description of the parameters.</p> <p>Warning</p> <ul> <li>The balance method does not support multioutput tasks.</li> <li>This 
transformation is only applied to the training set   in order to maintain the original distribution of target   classes in the test set.</li> </ul> <p>Tip</p> <p>Use atom's classes attribute for an overview of the target class distribution per data set.</p> <p></p> <p>method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.</p> <p>Use the parameters to choose which transformations to perform. The available steps are:</p> <ul> <li>Convert dtypes to the best possible types.</li> <li>Drop columns with specific data types.</li> <li>Remove characters from column names.</li> <li>Strip categorical features from spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column (ignored for regression tasks).</li> </ul> <p>See the Cleaner class for a description of the parameters.</p> <p></p> <p>method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.</p> <p>For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. 
Ignores categorical columns.</p> <p>See the Discretizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to visualize a column's distribution and decide on the bins.</p> <p></p> <p>method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.</p> <p>The encoding type depends on the number of classes in the column:</p> <ul> <li>If n_classes=2 or ordinal feature, use Ordinal-encoding.</li> <li>If 2 &lt; n_classes &lt;= <code>max_onehot</code>, use OneHot-encoding.</li> <li>If n_classes &gt; <code>max_onehot</code>, use <code>strategy</code>-encoding.</li> </ul> <p>Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.</p> <p>See the Encoder class for a description of the parameters.</p> <p>Note</p> <p>This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.</p> <p>Tip</p> <p>Use the categorical attribute for a list of the categorical features in the dataset.</p> <p></p> <p>method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.</p> <p>Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. 
Use the <code>missing</code> attribute to customize what are considered \"missing values\".</p> <p>See the Imputer class for a description of the parameters.</p> <p>Tip</p> <p>Use the nans attribute to check the amount of missing values per column.</p> <p></p> <p>method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.</p> <p>This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.</p> <p>See the Normalizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to examine a column's distribution.</p> <p></p> <p>method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.</p> <p>Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.</p> <p>See the Pruner class for a description of the parameters.</p> <p>Note</p> <p>This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.</p> <p>Tip</p> <p>Use the outliers attribute to check the number of outliers per column.</p> <p></p> <p>method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.</p> <p>Apply one of sklearn's scalers. Categorical columns are ignored.</p> <p>See the Scaler class for a description of the parameters.</p> <p>Tip</p> <p>Use the scaled attribute to check whether the dataset is scaled.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#nlp", "title": "NLP", "text": "<p>The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. 
All transformations are applied only on the column in the dataset called <code>corpus</code>. Read more in the user guide.</p> <p>textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.</p> <p></p> <p>method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.</p> <p>Transformations include normalizing characters and drop noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised.</p> <p>See the TextCleaner class for a description of the parameters.</p> <p></p> <p>method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.</p> <p>Convert words to a more uniform standard. The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.</p> <p>See the TextNormalizer class for a description of the parameters.</p> <p></p> <p>method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.</p> <p>Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named <code>corpus</code>. 
If there is no column with that name, an exception is raised.</p> <p>See the Tokenizer class for a description of the parameters.</p> <p></p> <p>method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.</p> <p>Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix <code>corpus_</code>. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.</p> <p>See the Vectorizer class for a description of the parameters.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#feature-engineering", "title": "Feature engineering", "text": "<p>To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.</p> <p>feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.</p> <p></p> <p>method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.</p> <p>Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype <code>datetime64</code> are used as is. 
Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.</p> <p>See the FeatureExtractor class for a description of the parameters.</p> <p></p> <p>method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.</p> <p>Create new combinations of existing features to capture the non-linear relations between the original features.</p> <p>See the FeatureGenerator class for a description of the parameters.</p> <p></p> <p>method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.</p> <p>Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the <code>groups</code> method.</p> <p>See the FeatureGrouper class for a description of the parameters.</p> <p>Tip</p> <p>Use a regex pattern with the <code>groups</code> parameter to select groups easier, e.g., <code>atom.feature_grouping({\"group1\": \"var_.+\"})</code> to select all features that start with <code>var_</code>.</p> <p></p> <p>method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.</p> <p>Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. 
Additionally, remove multicollinear and low-variance features.</p> <p>See the FeatureSelector class for a description of the parameters.</p> <p>Note</p> <ul> <li>When strategy=\"univariate\" and solver=None, f_classif   or f_regression is used as default solver.</li> <li>When strategy is \"sfs\", \"rfecv\" or any of the   advanced strategies and no scoring is specified,   atom's metric (if it exists) is used as scoring.</li> </ul> <p></p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#training", "title": "Training", "text": "<p>The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.</p> <p>runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.</p> <p></p> <p>method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.</p> <p>Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the DirectClassifier or DirectRegressor class for a description of the parameters.</p> <p></p> <p>method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the 
models in a successive halving fashion.</p> <p>The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.</p> <p></p> <p>method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.</p> <p>When training models, there is usually a trade-off between model performance and computation time; that is regulated by the number of samples in the training set. This method can be used to create insights in this trade-off, and help determine the optimal size of the training set. 
The models are fitted multiple times, ever-increasing the number of samples in the training set.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.</p> <p></p>"}, {"location": "API/ATOM/atomforecaster/", "title": "ATOMForecaster", "text": "<p>class atom.api.ATOMForecaster(*arrays, y=-1, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for forecasting tasks.</p> <p>Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.</p> <p>All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.</p> <p>Parameters*arrays: sequence of indexables Dataset containing exogeneous features and time series. 
Allowed formats are: <ul> <li>X</li> <li>y</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Exogeneous feature set corresponding to y, with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Time series.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>y: int, str, dict, sequence or dataframe, default=-1 Time series. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>This parameter is ignored if the time series is provided through <code>arrays</code>.</p> <p>test_size: int or float, default=0.2 <ul> <li>If &lt;=1: Fraction of the dataset to include in the test set.</li> <li>If &gt;1: Number of rows to include in the test set.</li> </ul> <p>This parameter is ignored if the test set is provided through <code>arrays</code>.</p> <p>holdout_size: int, float or None, default=None <ul> <li>If None: No holdout data set is kept apart.</li> <li>If &lt;=1: Fraction of the dataset to include in the holdout set.</li> <li>If &gt;1: Number of rows to include in the holdout set.</li> </ul> <p>This parameter is ignored if the holdout set is provided through <code>arrays</code>.</p> <p>n_rows: int or float, default=1 Subsample of the dataset to use. 
The cut is made from the head of the dataset (older entries are dropped when sorted by date ascending). The default value selects all rows. <ul> <li>If &lt;=1: Fraction of the dataset to select.</li> <li>If &gt;1: Exact number of rows to select. Only if <code>arrays</code> is X          or X, y.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>ATOMRegressor Main class for regression tasks.</p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMForecaster(y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Univariate forecast.\n\nDataset stats ==================== &gt;&gt;\nShape: (144, 1)\nTrain set size: 116\n --&gt; From: 1949-01  To: 1958-08\nTest set size: 28\n --&gt; From: 1958-09  To: 1960-12\n-------------------------------------\nMemory: 6.47 kB\nDuplicates: 26 (18.1%)\n\n\n\n&gt;&gt;&gt; # Train models\n&gt;&gt;&gt; atom.run(models=[\"NF\", \"ES\", \"ETS\"])\n\n\nTraining ========================= &gt;&gt;\nModels: NF, ES, ETS\nMetric: mape\n\n\nResults for NaiveForecaster:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0858\nTest evaluation --&gt; mape: -0.2305\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.025s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0864\nTest evaluation --&gt; mape: -0.2303\nTime elapsed: 0.042s\n-------------------------------------------------\nTime: 0.042s\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0858\nTest evaluation --&gt; mape: -0.2305\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.090s\n-------------------------------------\nNaiveForecaster      --&gt; mape: -0.2305\nExponentialSmoothing --&gt; mape: -0.2303 !\nETS               
   --&gt; mape: -0.2305\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(atom.results)\n\n     mape_train  mape_test  time_fit      time\nNF      -0.0858    -0.2305  0.025023  0.025023\nES      -0.0864    -0.2303  0.042052  0.042052\nETS     -0.0858    -0.2305  0.021019  0.021019\n\n\n&gt;&gt;&gt; print(atom.evaluate())\n\n         mae    mape         mse      r2      rmse\nNF  -91.8571 -0.2305 -10656.7143 -0.7278 -103.2314\nES  -91.8163 -0.2303 -10647.1506 -0.7263 -103.1850\nETS -91.8563 -0.2305 -10656.5266 -0.7278 -103.2305\n</code></pre>"}, {"location": "API/ATOM/atomforecaster/#magic-methods", "title": "Magic methods", "text": "<p>The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.</p> <ul> <li>__repr__: Prints an overview of atom's branches, models and metric.</li> <li>__len__: Returns the length of the dataset.</li> <li>__iter__: Iterate over the pipeline's transformers.</li> <li>__contains__: Checks if the provided item is a column in the dataset.</li> <li>__getitem__: Access a branch, model, column or subset of the dataset.</li> </ul> <p></p>"}, {"location": "API/ATOM/atomforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomforecaster/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributespipeline: PipelinePipeline of transforms. <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). 
dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled. <p>A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\". <p>These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them. 
<p>This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values. <p>This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values. <p>This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers. <p>This property is unavailable for sparse datasets. </p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesbranch: BranchCurrent active branch. <p>Use the property's <code>@setter</code> to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use <code>_from_</code> to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. 
results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/ATOM/atomforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#utility-methods", "title": "Utility methods", "text": "<p>Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.</p> <p>addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a <code>.csv</code> file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.</p> <p>If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. 
If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.</p> <p>Warning</p> <ul> <li>The transformer should have fit and/or transform methods   with arguments <code>X</code> (accepting a dataframe-like object of   shape=(n_samples, n_features)) and/or <code>y</code> (accepting a   sequence of shape=(n_samples,)).</li> <li>The transform method should return a feature set as a   dataframe-like object of shape=(n_samples, n_features)   and/or a target column as a sequence of shape=(n_samples,).</li> </ul> <p>Note</p> <p>If the transform method doesn't return a dataframe:</p> <ul> <li>The column naming happens as follows. If the transformer   has a <code>get_feature_names_out</code> method, it is used. If not,   and it returns the same number of columns, the names are   kept equal. If the number of columns changes, old columns   will keep their name (as long as the column is unchanged)   and new columns will receive the name <code>x[N-1]</code>, where N   stands for the n-th feature. This means that a transformer   should only transform, add or drop columns, not   combinations of these.</li> <li>The index remains the same as before the transformation.   This means that the transformer should not add, remove or   shuffle rows unless it returns a dataframe.</li> </ul> <p>Note</p> <p>If the transformer has a <code>n_jobs</code> and/or <code>random_state</code> parameter that is left to its default value, it adopts atom's value.</p> <p>Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a <code>transform</code> method. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. <p>train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. 
Note that if True, the transformation is skipped when making predictions on new data. <p>**fit_params Additional keyword arguments for the transformer's fit method. </p> <p></p> <p>method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.</p> <p>This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...</p> <p>Note</p> <p>This approach is preferred over changing the dataset directly through the property's <code>@setter</code> since the transformation is stored in the pipeline.</p> <p>Tip</p> <p>Use <code>atom.apply(lambda df: df.drop(\"column_name\",axis=1))</code> to store the removal of columns in the pipeline.</p> <p>Parametersfunc: callable Function to apply with signature <code>func(dataset, **kw_args) -&gt; dataset</code>. <p>inverse_func: callable or None, default=None Inverse function of <code>func</code>. If None, the inverse_transform method returns the input unchanged. <p>kw_args: dict or None, default=None Additional keyword arguments for the function. <p>inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function. </p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.</p> <p>Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.</p> <p>Tip</p> <p>Use the plot_distribution method to plot a column's distribution.</p> <p>Parametersdistributions: str, sequence or None, default=None Names of the distributions in <code>scipy.stats</code> to get the statistics on. If None, a selection of the most common ones is used. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to perform the test on. 
If None, select all numerical columns. <p>Returnspd.DataFrame Statistic results with multiindex levels: <ul> <li>dist: Name of the distribution.</li> <li>stat: Statistic results:<ul> <li>score: KS-test score.</li> <li>p_value: Corresponding p-value. </li> </ul> </li> </ul> <p></p> <p>method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.</p> <p>ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the <code>report</code> attribute. It can either report one dataset or compare two datasets against each other.</p> <p>Warning</p> <p>This method can be slow for large datasets.</p> <p>Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to report.</li> <li>If sequence: Names of two data sets to compare.</li> <li>If dict: Names of up to two data sets with corresponding   selection of rows to report.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.</p> <p>The returned weights are inversely proportional to the class frequencies in the selected data set. 
For multioutput tasks, the weights of each column of <code>y</code> will be multiplied.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsseries Sequence of weights with shape=(n_samples,). </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement an <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. 
</p> <p></p> <p>function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.</p> <p>If the instance was saved using <code>save_data=False</code>, it's possible to load new data into it and apply all data transformations.</p> <p>Info</p> <p>The loaded instance's current branch is the same branch as it was when saved.</p> <p>Parametersfilename: str or Path Filename or pathlib.Path of the pickle file. <p>data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using <code>save_data=False</code>. Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsatom Unpickled atom instance. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. 
It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset(hard=False)[source]Reset the instance to its initial state.</p> <p>Deletes all branches and models. The dataset is also reset to its form after initialization.</p> <p>Parametershard: bool, default=False If True, completely flushes the cache. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a <code>.csv</code> file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. 
Use \"auto\" for automatic naming. <p>rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save. <p>**kwargs Additional keyword arguments for pandas' to_csv method. </p> <p></p> <p>method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.</p> <p>Examples are: float64 -&gt; float32, int64 -&gt; int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.</p> <p>Parametersint2bool: bool, default=False Whether to convert <code>int</code> columns to <code>bool</code> type. Only if the values in the column are strictly in (0, 1) or (-1, 1). <p>int2uint: bool, default=False Whether to convert <code>int</code> to <code>uint</code> (unsigned integer). Only if the values in the column are strictly positive. <p>str2cat: bool, default=False Whether to convert <code>string</code> to <code>category</code>. Only if the number of categories is less than 30% of the column's length. <p>dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to shrink. If None, transform all columns. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. 
<p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method stats()[source]Display basic information about the dataset.</p> <p></p> <p>method status()[source]Get an overview of the branches and models.</p> <p>This method prints the same information as the __repr__ and also saves it to the logger.</p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#data-cleaning", "title": "Data cleaning", "text": "<p>The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.</p> <p>Tip</p> <p>Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.</p> <p>cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.</p> <p></p> <p>method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.</p> <p>Use the parameters to choose which transformations to perform. 
The available steps are:</p> <ul> <li>Convert dtypes to the best possible types.</li> <li>Drop columns with specific data types.</li> <li>Remove characters from column names.</li> <li>Strip categorical features from spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column (ignored for regression tasks).</li> </ul> <p>See the Cleaner class for a description of the parameters.</p> <p></p> <p>method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.</p> <p>For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.</p> <p>See the Discretizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to visualize a column's distribution and decide on the bins.</p> <p></p> <p>method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.</p> <p>The encoding type depends on the number of classes in the column:</p> <ul> <li>If n_classes=2 or ordinal feature, use Ordinal-encoding.</li> <li>If 2 &lt; n_classes &lt;= <code>max_onehot</code>, use OneHot-encoding.</li> <li>If n_classes &gt; <code>max_onehot</code>, use <code>strategy</code>-encoding.</li> </ul> <p>Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.</p> <p>See the Encoder class for a description of the parameters.</p> <p>Note</p> <p>This method only encodes the categorical features. It does not encode the target column! 
Use the clean method for that.</p> <p>Tip</p> <p>Use the categorical attribute for a list of the categorical features in the dataset.</p> <p></p> <p>method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.</p> <p>Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the <code>missing</code> attribute to customize what are considered \"missing values\".</p> <p>See the Imputer class for a description of the parameters.</p> <p>Tip</p> <p>Use the nans attribute to check the amount of missing values per column.</p> <p></p> <p>method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.</p> <p>This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.</p> <p>See the Normalizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to examine a column's distribution.</p> <p></p> <p>method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.</p> <p>Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.</p> <p>See the Pruner class for a description of the parameters.</p> <p>Note</p> <p>This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.</p> <p>Tip</p> <p>Use the outliers attribute to check the number of outliers per column.</p> <p></p> <p>method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.</p> <p>Apply one of sklearn's scalers. 
Categorical columns are ignored.</p> <p>See the Scaler class for a description of the parameters.</p> <p>Tip</p> <p>Use the scaled attribute to check whether the dataset is scaled.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#nlp", "title": "NLP", "text": "<p>The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called <code>corpus</code>. Read more in the user guide.</p> <p>textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.</p> <p></p> <p>method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.</p> <p>Transformations include normalizing characters and drop noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised.</p> <p>See the TextCleaner class for a description of the parameters.</p> <p></p> <p>method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.</p> <p>Convert words to a more uniform standard. The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised. 
If the provided documents are strings, words are separated by spaces.</p> <p>See the TextNormalizer class for a description of the parameters.</p> <p></p> <p>method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.</p> <p>Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>See the Tokenizer class for a description of the parameters.</p> <p></p> <p>method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.</p> <p>Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix <code>corpus_</code>. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.</p> <p>See the Vectorizer class for a description of the parameters.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#feature-engineering", "title": "Feature engineering", "text": "<p>To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. 
Read more in the user guide.</p> <p>feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.</p> <p></p> <p>method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.</p> <p>Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype <code>datetime64</code> are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.</p> <p>See the FeatureExtractor class for a description of the parameters.</p> <p></p> <p>method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.</p> <p>Create new combinations of existing features to capture the non-linear relations between the original features.</p> <p>See the FeatureGenerator class for a description of the parameters.</p> <p></p> <p>method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.</p> <p>Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. 
The group names and features can be accessed through the <code>groups</code> method.</p> <p>See the FeatureGrouper class for a description of the parameters.</p> <p>Tip</p> <p>Use a regex pattern with the <code>groups</code> parameter to select groups more easily, e.g., <code>atom.feature_grouping({\"group1\": \"var_.+\"})</code> to select all features that start with <code>var_</code>.</p> <p></p> <p>method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.</p> <p>Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.</p> <p>See the FeatureSelector class for a description of the parameters.</p> <p>Note</p> <ul> <li>When strategy=\"univariate\" and solver=None, f_classif   or f_regression is used as default solver.</li> <li>When strategy is \"sfs\", \"rfecv\" or any of the   advanced strategies and no scoring is specified,   atom's metric (if it exists) is used as scoring.</li> </ul> <p></p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#training", "title": "Training", "text": "<p>The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. 
Read more in the user guide.</p> <p>runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.</p> <p></p> <p>method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.</p> <p>Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the DirectClassifier or DirectRegressor class for a description of the parameters.</p> <p></p> <p>method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.</p> <p>The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. 
For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.</p> <p></p> <p>method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.</p> <p>When training models, there is usually a trade-off between model performance and computation time; that is regulated by the number of samples in the training set. This method can be used to create insights in this trade-off, and help determine the optimal size of the training set. 
The models are fitted multiple times, ever-increasing the number of samples in the training set.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.</p> <p></p>"}, {"location": "API/ATOM/atommodel/", "title": "ATOMModel", "text": "<p>function atom.api.ATOMModel(estimator, name=None, acronym=None, needs_scaling=False, native_multilabel=False, native_multioutput=False, has_validation=None)[source]Convert an estimator to a model that can be ingested by atom.</p> <p>This function adds the relevant attributes to the estimator so that they can be used by atom. Note that only estimators that follow sklearn's API are compatible.</p> <p>Read more about custom models in the user guide.</p> <p>Parametersestimator: Predictor Custom estimator. Should implement a <code>fit</code> and <code>predict</code> method. <p>name: str or None, default=None Name for the model. This is the value used to call the model from atom. The value should start with the model's <code>acronym</code> when specified. If None, the capital letters of the estimator's name are used (only if two or more, else it uses the entire name). <p>acronym: str or None, default=None Model's acronym. If None, it uses the model's <code>name</code>. Specify this parameter when you want to train multiple custom models that share the same estimator. <p>needs_scaling: bool, default=False Whether the model should use automated feature scaling. <p>native_multilabel: bool, default=False Whether the model has native support for multilabel tasks. 
If False and the task is multilabel, a multilabel meta-estimator is wrapped around the estimator. <p>native_multioutput: bool, default=False Whether the model has native support for multioutput tasks. If False and the task is multioutput, a multioutput meta-estimator is wrapped around the estimator. <p>has_validation: str or None, default=None Whether the model allows in-training validation. <ul> <li>If None: No support for in-training validation.</li> <li>If str: Name of the estimator's parameter that states the   number of iterations, e.g., <code>n_estimators</code> for   RandomForestClassifier.</li> </ul> <p>ReturnsPredictor Estimator with provided information. Provide this instance to the <code>models</code> parameter of the run method. <p></p>"}, {"location": "API/ATOM/atommodel/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor, ATOMModel\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n&gt;&gt;&gt; from sklearn.linear_model import RANSACRegressor\n\n&gt;&gt;&gt; ransac = ATOMModel(\n...     estimator=RANSACRegressor(),\n...     name=\"RANSAC\",\n...     needs_scaling=False,\n... 
)\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (442, 11)\nTrain set size: 354\nTest set size: 88\n-------------------------------------\nMemory: 39.03 kB\nScaled: False\nOutlier values: 12 (0.3%)\n\n\n&gt;&gt;&gt; atom.run(ransac)\n\n\nTraining ========================= &gt;&gt;\nModels: RANSAC\nMetric: r2\n\n\nResults for RANSACRegressor:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.2946\nTest evaluation --&gt; r2: 0.3787\nTime elapsed: 0.059s\n-------------------------------------------------\nTime: 0.059s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.060s\n-------------------------------------\nRANSACRegressor --&gt; r2: 0.3787\n</code></pre>"}, {"location": "API/ATOM/atomregressor/", "title": "ATOMRegressor", "text": "<p>class atom.api.ATOMRegressor(*arrays, y=-1, index=False, shuffle=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for regression tasks.</p> <p>Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.</p> <p>All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.</p> <p>Parameters*arrays: sequence of indexables Dataset containing features and target. 
Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>This parameter is ignored if the target column is provided through <code>arrays</code>.</p> <p>index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe. 
<ul> <li>If False: Reset to RangeIndex.</li> <li>If True: Use the provided index.</li> <li>If int: Position of the column to use as index.</li> <li>If str: Name of the column to use as index.</li> <li>If sequence: Array with shape=(n_samples,) to use as index.</li> </ul> <p>test_size: int or float, default=0.2 <ul> <li>If &lt;=1: Fraction of the dataset to include in the test set.</li> <li>If &gt;1: Number of rows to include in the test set.</li> </ul> <p>This parameter is ignored if the test set is provided through <code>arrays</code>.</p> <p>holdout_size: int, float or None, default=None <ul> <li>If None: No holdout data set is kept apart.</li> <li>If &lt;=1: Fraction of the dataset to include in the holdout set.</li> <li>If &gt;1: Number of rows to include in the holdout set.</li> </ul> <p>This parameter is ignored if the holdout set is provided through <code>arrays</code>.</p> <p>shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets. <p>n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows. <ul> <li>If &lt;=1: Fraction of the dataset to select.</li> <li>If &gt;1: Exact number of rows to select. Only if <code>arrays</code> is X          or X, y.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. 
The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWarnings</code> environment. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p></p>"}, {"location": "API/ATOM/atomregressor/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (442, 11)\nTrain set size: 354\nTest set size: 88\n-------------------------------------\nMemory: 39.03 kB\nScaled: False\nOutlier values: 11 (0.3%)\n\n\n\n&gt;&gt;&gt; # Apply data cleaning and feature engineering methods\n&gt;&gt;&gt; atom.scale()\n\nFitting Scaler...\nScaling features...\n\n&gt;&gt;&gt; atom.feature_selection(strategy=\"rfecv\", solver=\"xgb\", n_features=12)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfecv selected 10 features from the dataset.\n\n\n&gt;&gt;&gt; # Train models\n&gt;&gt;&gt; atom.run(models=[\"OLS\", \"RF\", \"XGB\"])\n\n\nTraining ========================= &gt;&gt;\nModels: OLS, RF, XGB\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.5313\nTest evaluation --&gt; r2: 
0.4452\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.9203\nTest evaluation --&gt; r2: 0.3471\nTime elapsed: 0.434s\n-------------------------------------------------\nTime: 0.434s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 1.0\nTest evaluation --&gt; r2: 0.2881\nTime elapsed: 0.187s\n-------------------------------------------------\nTime: 0.187s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.645s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.4452 !\nRandomForest         --&gt; r2: 0.3471 ~\nXGBoost              --&gt; r2: 0.2881 ~\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(atom.results)\n\n     r2_train  r2_test  time_fit      time\nOLS    0.5313   0.4452  0.020018  0.020018\nRF     0.9203   0.3471  0.434395  0.434395\nXGB    1.0000   0.2881  0.187170  0.187170\n\n\n&gt;&gt;&gt; print(atom.evaluate())\n\n         mae    mape        mse      r2     rmse\nOLS -45.1949 -0.4267 -3172.9439  0.4452 -56.3289\nRF  -49.8684 -0.4612 -3733.6766  0.3471 -61.1038\nXGB -52.0370 -0.4708 -4071.0416  0.2881 -63.8047\n</code></pre>"}, {"location": "API/ATOM/atomregressor/#magic-methods", "title": "Magic methods", "text": "<p>The class contains some magic methods to help you access some of its elements faster. 
Note that methods that apply on the pipeline can return different results per branch.</p> <ul> <li>__repr__: Prints an overview of atom's branches, models and metric.</li> <li>__len__: Returns the length of the dataset.</li> <li>__iter__: Iterate over the pipeline's transformers.</li> <li>__contains__: Checks if the provided item is a column in the dataset.</li> <li>__getitem__: Access a branch, model, column or subset of the dataset.</li> </ul> <p></p>"}, {"location": "API/ATOM/atomregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomregressor/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributespipeline: PipelinePipeline of transforms. <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. 
y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled. <p>A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\". <p>These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them. <p>This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values. <p>This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values. <p>This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers. <p>This property is unavailable for sparse datasets. 
</p> <p></p>"}, {"location": "API/ATOM/atomregressor/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesbranch: BranchCurrent active branch. <p>Use the property's <code>@setter</code> to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use <code>_from_</code> to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. 
</li> </ul> <p></p>"}, {"location": "API/ATOM/atomregressor/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/ATOM/atomregressor/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/ATOM/atomregressor/#utility-methods", "title": "Utility methods", "text": "<p>Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.</p> <p>addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a <code>.csv</code> file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.</p> <p>If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. 
If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.</p> <p>Warning</p> <ul> <li>The transformer should have fit and/or transform methods   with arguments <code>X</code> (accepting a dataframe-like object of   shape=(n_samples, n_features)) and/or <code>y</code> (accepting a   sequence of shape=(n_samples,)).</li> <li>The transform method should return a feature set as a   dataframe-like object of shape=(n_samples, n_features)   and/or a target column as a sequence of shape=(n_samples,).</li> </ul> <p>Note</p> <p>If the transform method doesn't return a dataframe:</p> <ul> <li>The column naming happens as follows. If the transformer   has a <code>get_feature_names_out</code> method, it is used. If not,   and it returns the same number of columns, the names are   kept equal. If the number of columns changes, old columns   will keep their name (as long as the column is unchanged)   and new columns will receive the name <code>x[N-1]</code>, where N   stands for the n-th feature. This means that a transformer   should only transform, add or drop columns, not   combinations of these.</li> <li>The index remains the same as before the transformation.   This means that the transformer should not add, remove or   shuffle rows unless it returns a dataframe.</li> </ul> <p>Note</p> <p>If the transformer has a <code>n_jobs</code> and/or <code>random_state</code> parameter that is left to its default value, it adopts atom's value.</p> <p>Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a <code>transform</code> method. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. <p>train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. 
Note that if True, the transformation is skipped when making predictions on new data. <p>**fit_params Additional keyword arguments for the transformer's fit method. </p> <p></p> <p>method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.</p> <p>This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...</p> <p>Note</p> <p>This approach is preferred over changing the dataset directly through the property's <code>@setter</code> since the transformation is stored in the pipeline.</p> <p>Tip</p> <p>Use <code>atom.apply(lambda df: df.drop(\"column_name\",axis=1))</code> to store the removal of columns in the pipeline.</p> <p>Parametersfunc: callable Function to apply with signature <code>func(dataset, **kw_args) -&gt; dataset</code>. <p>inverse_func: callable or None, default=None Inverse function of <code>func</code>. If None, the inverse_transform method returns the input unchanged. <p>kw_args: dict or None, default=None Additional keyword arguments for the function. <p>inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function. </p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.</p> <p>Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.</p> <p>Tip</p> <p>Use the plot_distribution method to plot a column's distribution.</p> <p>Parametersdistributions: str, sequence or None, default=None Names of the distributions in <code>scipy.stats</code> to get the statistics on. If None, a selection of the most common ones is used. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to perform the test on. 
If None, select all numerical columns. <p>Returnspd.DataFrame Statistic results with multiindex levels: <ul> <li>dist: Name of the distribution.</li> <li>stat: Statistic results:<ul> <li>score: KS-test score.</li> <li>p_value: Corresponding p-value. </li> </ul> </li> </ul> <p></p> <p>method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.</p> <p>ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the <code>report</code> attribute. It can either report one dataset or compare two datasets against each other.</p> <p>Warning</p> <p>This method can be slow for large datasets.</p> <p>Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to report.</li> <li>If sequence: Names of two data sets to compare.</li> <li>If dict: Names of up to two data sets with corresponding   selection of rows to report.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.</p> <p>The returned weights are inversely proportional to the class frequencies in the selected data set. 
For multioutput tasks, the weights of each column of <code>y</code> will be multiplied.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsseries Sequence of weights with shape=(n_samples,). </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement an <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. 
</p> <p></p> <p>function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.</p> <p>If the instance was saved using <code>save_data=False</code>, it's possible to load new data into it and apply all data transformations.</p> <p>Info</p> <p>The loaded instance's current branch is the same branch as it was when saved.</p> <p>Parametersfilename: str or Path Filename or pathlib.Path of the pickle file. <p>data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using <code>save_data=False</code>. Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsatom Unpickled atom instance. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. 
It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset(hard=False)[source]Reset the instance to its initial state.</p> <p>Deletes all branches and models. The dataset is also reset to its form after initialization.</p> <p>Parametershard: bool, default=False If True, completely flushes the cache. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a <code>.csv</code> file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. 
Use \"auto\" for automatic naming. <p>rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save. <p>**kwargs Additional keyword arguments for pandas' to_csv method. </p> <p></p> <p>method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.</p> <p>Examples are: float64 -&gt; float32, int64 -&gt; int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.</p> <p>Parametersint2bool: bool, default=False Whether to convert <code>int</code> columns to <code>bool</code> type. Only if the values in the column are strictly in (0, 1) or (-1, 1). <p>int2uint: bool, default=False Whether to convert <code>int</code> to <code>uint</code> (unsigned integer). Only if the values in the column are strictly positive. <p>str2cat: bool, default=False Whether to convert <code>string</code> to <code>category</code>. Only if the number of categories is less than 30% of the column's length. <p>dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column. <p>columns: int, str, segment, sequence or None, default=None Selection of columns to shrink. If None, transform all columns. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. 
<p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method stats()[source]Display basic information about the dataset.</p> <p></p> <p>method status()[source]Get an overview of the branches and models.</p> <p>This method prints the same information as the __repr__ and also saves it to the logger.</p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p> <p></p>"}, {"location": "API/ATOM/atomregressor/#data-cleaning", "title": "Data cleaning", "text": "<p>The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.</p> <p>Tip</p> <p>Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.</p> <p>cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.</p> <p></p> <p>method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.</p> <p>Use the parameters to choose which transformations to perform. 
The available steps are:</p> <ul> <li>Convert dtypes to the best possible types.</li> <li>Drop columns with specific data types.</li> <li>Remove characters from column names.</li> <li>Strip categorical features from spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column (ignored for regression tasks).</li> </ul> <p>See the Cleaner class for a description of the parameters.</p> <p></p> <p>method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.</p> <p>For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.</p> <p>See the Discretizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to visualize a column's distribution and decide on the bins.</p> <p></p> <p>method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.</p> <p>The encoding type depends on the number of classes in the column:</p> <ul> <li>If n_classes=2 or ordinal feature, use Ordinal-encoding.</li> <li>If 2 &lt; n_classes &lt;= <code>max_onehot</code>, use OneHot-encoding.</li> <li>If n_classes &gt; <code>max_onehot</code>, use <code>strategy</code>-encoding.</li> </ul> <p>Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.</p> <p>See the Encoder class for a description of the parameters.</p> <p>Note</p> <p>This method only encodes the categorical features. It does not encode the target column! 
Use the clean method for that.</p> <p>Tip</p> <p>Use the categorical attribute for a list of the categorical features in the dataset.</p> <p></p> <p>method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.</p> <p>Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the <code>missing</code> attribute to customize what are considered \"missing values\".</p> <p>See the Imputer class for a description of the parameters.</p> <p>Tip</p> <p>Use the nans attribute to check the amount of missing values per column.</p> <p></p> <p>method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.</p> <p>This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.</p> <p>See the Normalizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to examine a column's distribution.</p> <p></p> <p>method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.</p> <p>Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.</p> <p>See the Pruner class for a description of the parameters.</p> <p>Note</p> <p>This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.</p> <p>Tip</p> <p>Use the outliers attribute to check the number of outliers per column.</p> <p></p> <p>method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.</p> <p>Apply one of sklearn's scalers. 
Categorical columns are ignored.</p> <p>See the Scaler class for a description of the parameters.</p> <p>Tip</p> <p>Use the scaled attribute to check whether the dataset is scaled.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomregressor/#nlp", "title": "NLP", "text": "<p>The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called <code>corpus</code>. Read more in the user guide.</p> <p>textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.</p> <p></p> <p>method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.</p> <p>Transformations include normalizing characters and drop noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised.</p> <p>See the TextCleaner class for a description of the parameters.</p> <p></p> <p>method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.</p> <p>Convert words to a more uniform standard. The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised. 
If the provided documents are strings, words are separated by spaces.</p> <p>See the TextNormalizer class for a description of the parameters.</p> <p></p> <p>method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.</p> <p>Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>See the Tokenizer class for a description of the parameters.</p> <p></p> <p>method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.</p> <p>Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix <code>corpus_</code>. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.</p> <p>See the Vectorizer class for a description of the parameters.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomregressor/#feature-engineering", "title": "Feature engineering", "text": "<p>To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. 
Read more in the user guide.</p> <p>feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.</p> <p></p> <p>method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.</p> <p>Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype <code>datetime64</code> are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.</p> <p>See the FeatureExtractor class for a description of the parameters.</p> <p></p> <p>method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.</p> <p>Create new combinations of existing features to capture the non-linear relations between the original features.</p> <p>See the FeatureGenerator class for a description of the parameters.</p> <p></p> <p>method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.</p> <p>Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. 
The group names and features can be accessed through the <code>groups</code> method.</p> <p>See the FeatureGrouper class for a description of the parameters.</p> <p>Tip</p> <p>Use a regex pattern with the <code>groups</code> parameter to select groups more easily, e.g., <code>atom.feature_grouping({\"group1\": \"var_.+\"})</code> to select all features that start with <code>var_</code>.</p> <p></p> <p>method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.</p> <p>Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.</p> <p>See the FeatureSelector class for a description of the parameters.</p> <p>Note</p> <ul> <li>When strategy=\"univariate\" and solver=None, f_classif   or f_regression is used as default solver.</li> <li>When strategy is \"sfs\", \"rfecv\" or any of the   advanced strategies and no scoring is specified,   atom's metric (if it exists) is used as scoring.</li> </ul> <p></p> <p></p>"}, {"location": "API/ATOM/atomregressor/#training", "title": "Training", "text": "<p>The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. 
Read more in the user guide.</p> <p>runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.</p> <p></p> <p>method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.</p> <p>Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the DirectClassifier or DirectRegressor class for a description of the parameters.</p> <p></p> <p>method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.</p> <p>The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. 
For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.</p> <p></p> <p>method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.</p> <p>When training models, there is usually a trade-off between model performance and computation time that is regulated by the number of samples in the training set. This method can be used to gain insight into this trade-off and help determine the optimal size of the training set. 
The models are fitted multiple times, ever-increasing the number of samples in the training set.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.</p> <p></p>"}, {"location": "API/branch/branch/", "title": "Branch", "text": "<p>class atom.branch.branch.Branch(name, memory=None, data=None, holdout=None)[source]Object that contains the data.</p> <p>A branch contains a specific pipeline, the dataset transformed through that pipeline, the models fitted on that dataset, and all data and utility attributes that refer to that dataset. Branches can be created and accessed through atom's <code>branch</code> attribute.</p> <p>All public properties and attributes of the branch can be accessed from the parent.</p> <p>Read more in the user guide.</p> <p>Warning</p> <p>This class should not be called directly. Branches are created internally by the ATOMClassifier, ATOMForecaster and ATOMRegressor classes.</p> <p>Parametersname: str Name of the branch. <p>memory: str, Memory or None, default=None Memory object for pipeline caching and to store the data when the branch is inactive. <p>data: DataContainer or None, default=None Data for the branch. <p>holdout: dataframe or None, default=None Holdout data set. 
<p></p> <p></p> <p>See Also</p> <p>BranchManager Object that manages branches.</p> <p></p>"}, {"location": "API/branch/branch/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 177 (1.3%)\n\n\n\n&gt;&gt;&gt; # Train a model\n&gt;&gt;&gt; atom.run(\"RF\")\n\n\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9517\nTime elapsed: 0.236s\n-------------------------------------------------\nTime: 0.236s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.239s\n-------------------------------------\nRandomForest --&gt; f1: 0.9517\n\n\n&gt;&gt;&gt; # Change the branch and apply feature scaling\n&gt;&gt;&gt; atom.branch = \"scaled\"\n\nSuccessfully created new branch: scaled.\n\n\n&gt;&gt;&gt; atom.scale()\n\nFitting Scaler...\nScaling features...\n\n&gt;&gt;&gt; atom.run(\"RF_scaled\")\n\n\nTraining ========================= &gt;&gt;\nModels: RF_scaled\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9517\nTime elapsed: 0.237s\n-------------------------------------------------\nTime: 0.237s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 
0.240s\n-------------------------------------\nRandomForest --&gt; f1: 0.9517\n\n\n&gt;&gt;&gt; # Compare the models\n&gt;&gt;&gt; atom.plot_roc()\n</code></pre>"}, {"location": "API/branch/branch/#attributes", "title": "Attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). 
</p> <p></p>"}, {"location": "API/branch/branch/#methods", "title": "Methods", "text": "<p>loadLoad the branch's data from memory.storeStore the branch's data as a pickle in memory.</p> <p></p> <p>method load(assign=True)[source]Load the branch's data from memory.</p> <p>This method is used to restore the data of inactive branches.</p> <p>Parametersassign: bool, default=True Whether to assign the loaded data to <code>self</code>. <p>ReturnsDataContainer or None Own data information. Returns None if no data is set. </p> <p></p> <p>method store(assign=True)[source]Store the branch's data as a pickle in memory.</p> <p>After storage, the data is deleted, and the branch is no longer usable until load is called. This method is used to store the data for inactive branches.</p> <p>Note</p> <p>This method is skipped silently for branches with no memory allocation.</p> <p>Parametersassign: bool, default=True Whether to assign <code>None</code> to the data in <code>self</code>. </p> <p></p>"}, {"location": "API/branch/branchmanager/", "title": "BranchManager", "text": "<p>class atom.branch.branchmanager.BranchManager(memory=None)[source]Object that manages branches.</p> <p>Maintains references to a series of branches and the current active branch. Additionally, always stores an 'original' branch containing the original dataset (previous to any transformations). The branches share a reference to a holdout set, not the instance self. When a memory object is specified, it stores inactive branches in memory.</p> <p>Read more in the user guide.</p> <p>Warning</p> <p>This class should not be called directly. The BranchManager is created internally by the ATOMClassifier, ATOMForecaster and ATOMRegressor classes.</p> <p>Parametersmemory: str, Memory or None, default=None Location to store inactive branches. If None, all branches are kept in memory. This memory object is passed to the branches for pipeline caching. <p>Attributesbranches: ClassMap Collection of branches. 
<p>og: Branch Branch containing the original dataset. It can be any branch in <code>branches</code> or an internally made branch called <code>og</code>. <p>current: Branch Current active branch. <p></p> <p></p> <p>See Also</p> <p>Branch Object that contains the data.</p> <p></p>"}, {"location": "API/branch/branchmanager/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 174 (1.2%)\n\n\n\n&gt;&gt;&gt; # Train a model\n&gt;&gt;&gt; atom.run(\"RF\")\n\n\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9655\nTime elapsed: 0.229s\n-------------------------------------------------\nTime: 0.229s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.232s\n-------------------------------------\nRandomForest --&gt; f1: 0.9655\n\n\n&gt;&gt;&gt; # Change the branch and apply feature scaling\n&gt;&gt;&gt; atom.branch = \"scaled\"\n\nSuccessfully created new branch: scaled.\n\n\n&gt;&gt;&gt; atom.scale()\n\nFitting Scaler...\nScaling features...\n\n&gt;&gt;&gt; atom.run(\"RF_scaled\")\n\n\nTraining ========================= &gt;&gt;\nModels: RF_scaled\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 
0.9722\nTime elapsed: 0.228s\n-------------------------------------------------\nTime: 0.228s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.231s\n-------------------------------------\nRandomForest --&gt; f1: 0.9722\n\n\n&gt;&gt;&gt; # Compare the models\n&gt;&gt;&gt; atom.plot_roc()\n</code></pre>"}, {"location": "API/branch/branchmanager/#attributes", "title": "Attributes", "text": "<p>Attributesbranches: ClassMap Collection of branches. <p>og: Branch Branch containing the original dataset. It can be any branch in <code>branches</code> or an internally made branch called <code>og</code>. <p>current: Branch Current active branch. <p></p> <p></p>"}, {"location": "API/branch/branchmanager/#methods", "title": "Methods", "text": "<p>addAdd a new branch to the manager.fillFill the current branch with data.resetReset this instance to its initial state.</p> <p></p> <p>method add(name, parent=None)[source]Add a new branch to the manager.</p> <p>If the branch is called <code>og</code> (reserved name for the original branch), it's created separately and stored in memory.</p> <p>Parametersname: str Name for the new branch. <p>parent: Branch or None, default=None Parent branch. Data and attributes from the parent are passed to the new branch. </p> <p></p> <p>method fill(data, holdout=None)[source]Fill the current branch with data.</p> <p>Parametersdata: DataContainer New data for the current branch. <p>holdout: dataframe or None, default=None Holdout data set (if any). </p> <p></p> <p>method reset(hard=False)[source]Reset this instance to its initial state.</p> <p>The initial state of the BranchManager contains a single branch called <code>main</code> with no data. There's no reference to an original (<code>og</code>) branch.</p> <p>Parametershard: bool, default=False If True, completely flushes the cache. 
</p> <p></p>"}, {"location": "API/data_cleaning/balancer/", "title": "Balancer", "text": "<p>class atom.data_cleaning.Balancer(strategy=\"ADASYN\", n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Balance the number of samples per class in the target column.</p> <p>When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set. Use only for classification tasks.</p> <p>This class can be accessed from atom through the balance method. Read more in the user guide.</p> <p>Warning</p> <ul> <li>The clustercentroids estimator is unavailable because of    incompatibilities of the APIs.</li> <li>The Balancer class does not support multioutput tasks.</li> </ul> <p>Parametersstrategy: str or estimator, default=\"ADASYN\" Type of algorithm with which to balance the dataset. Choose from the name of any estimator in the imbalanced-learn package or provide a custom instance of such. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 - value.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. 
<p>Attributes[strategy]_: imblearn estimator Object (lowercase strategy) used to balance the data, e.g., <code>balancer.adasyn_</code> for the default strategy. <p>mapping_: dict Target values mapped to their respective encoded integers. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>target_names_in_: np.ndarray Names of the target column seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Encoder Perform encoding of categorical features.</p> <p>Imputer Handle missing values in the data.</p> <p>Pruner Prune outliers from the data.</p> <p></p>"}, {"location": "API/data_cleaning/balancer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.train)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630             0.054390         0.1720  ...           107.30       740.4            0.1610            0.42250          0.50300               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690             0.094510         0.1860  ...           
142.20      1493.0            0.1492            0.25360          0.37590               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699             0.047440         0.1538  ...           135.10      1320.0            0.1315            0.18060          0.20800               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686             0.027390         0.1852  ...           110.10       931.4            0.1148            0.09866          0.15470               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263             0.023080         0.1305  ...            63.34       270.0            0.1179            0.18790          0.15440               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n451        19.73         19.82          130.70     1206.0          0.10620           0.18490         0.24170             0.097400         0.1733  ...           159.80      1933.0            0.1710            0.59550          0.84890               0.25070          0.2749                  0.12970       0\n452        12.72         13.78           81.78      492.1          0.09667           0.08393         0.01288             0.019240         0.1638  ...            
88.54       553.7            0.1298            0.14720          0.05233               0.06343          0.2369                  0.06922       1\n453        11.51         23.93           74.52      403.5          0.09261           0.10210         0.11120             0.041050         0.1388  ...            82.28       474.2            0.1298            0.25170          0.36300               0.09653          0.2112                  0.08732       1\n454        10.75         14.97           68.26      355.3          0.07793           0.05139         0.02251             0.007875         0.1399  ...            77.79       441.2            0.1076            0.12230          0.09755               0.03413          0.2300                  0.06769       1\n455        25.22         24.91          171.50     1878.0          0.10630           0.26650         0.33390             0.184500         0.1829  ...           211.70      2562.0            0.1573            0.60760          0.64760               0.28670          0.2355                  0.10510       0\n\n[456 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.balance(strategy=\"smote\", verbose=2)\n\nOversampling with SMOTE...\n --&gt; Adding 116 samples to class 0.\n\n\n&gt;&gt;&gt; # Note that the number of rows has increased\n&gt;&gt;&gt; print(atom.train)\n\n     mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter   worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      13.480000     20.820000       88.400000   559.200000         0.101600          0.125500        0.106300             0.054390       0.172000  ...       
107.300000   740.400000          0.161000           0.422500         0.503000              0.225800        0.280700                 0.107100       0\n1      18.310000     20.580000      120.800000  1052.000000         0.106800          0.124800        0.156900             0.094510       0.186000  ...       142.200000  1493.000000          0.149200           0.253600         0.375900              0.151000        0.307400                 0.078630       0\n2      17.930000     24.480000      115.200000   998.900000         0.088550          0.070270        0.056990             0.047440       0.153800  ...       135.100000  1320.000000          0.131500           0.180600         0.208000              0.113600        0.250400                 0.079480       0\n3      15.130000     29.810000       96.710000   719.500000         0.083200          0.046050        0.046860             0.027390       0.185200  ...       110.100000   931.400000          0.114800           0.098660         0.154700              0.065750        0.323300                 0.061650       0\n4       8.950000     15.760000       58.740000   245.200000         0.094620          0.124300        0.092630             0.023080       0.130500  ...        63.340000   270.000000          0.117900           0.187900         0.154400              0.038460        0.165200                 0.077220       1\n..           ...           ...             ...          ...              ...               ...             ...                  ...            ...  ...              ...          ...               ...                ...              ...                   ...             ...                      ...     ...\n567    15.182945     22.486774       98.949465   711.386079         0.092513          0.102732        0.113923             0.069481       0.179224  ...       
107.689157   826.276172          0.126730           0.199259         0.295172              0.142325        0.265352                 0.068318       0\n568    19.990378     20.622944      130.491182  1253.735467         0.091583          0.117753        0.117236             0.082771       0.202428  ...       167.456689  1995.896044          0.132457           0.289652         0.332006              0.182989        0.299088                 0.084150       0\n569    18.158121     18.928220      119.907435  1027.331092         0.113149          0.147089        0.171862             0.103942       0.209306  ...       135.286302  1319.270051          0.127029           0.233493         0.260138              0.133851        0.302406                 0.079535       0\n570    23.733233     26.433751      158.185672  1724.145541         0.098008          0.193789        0.231158             0.139527       0.188817  ...       207.483796  2844.559632          0.150495           0.463361         0.599077              0.266433        0.290828                 0.091542       0\n571    17.669575     16.375717      115.468589   968.552411         0.093636          0.109983        0.101005             0.075283       0.174505  ...       133.767576  1227.195245          0.118221           0.264624         0.249798              0.135098        0.268044                 0.076533       0\n\n[572 rows x 31 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Balancer\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...          17.33           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...          23.41           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...          25.53           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...          26.50            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...          16.67           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...          
   ...                      ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...          26.40           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...          38.25           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...          34.12           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...          39.42           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...          
30.37            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039\n\n[569 rows x 30 columns]\n\n\n&gt;&gt;&gt; balancer = Balancer(strategy=\"smote\", verbose=2)\n&gt;&gt;&gt; X, y = balancer.fit_transform(X, y)\n\nOversampling with SMOTE...\n --&gt; Adding 145 samples to class 0.\n\n\n&gt;&gt;&gt; # Note that the number of rows has increased\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter   worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0      17.990000     10.380000      122.800000  1001.000000         0.118400          0.277600        0.300100             0.147100       0.241900  ...      17.330000       184.600000  2019.000000          0.162200           0.665600         0.711900              0.265400        0.460100                 0.118900\n1      20.570000     17.770000      132.900000  1326.000000         0.084740          0.078640        0.086900             0.070170       0.181200  ...      23.410000       158.800000  1956.000000          0.123800           0.186600         0.241600              0.186000        0.275000                 0.089020\n2      19.690000     21.250000      130.000000  1203.000000         0.109600          0.159900        0.197400             0.127900       0.206900  ...      25.530000       152.500000  1709.000000          0.144400           0.424500         0.450400              0.243000        0.361300                 0.087580\n3      11.420000     20.380000       77.580000   386.100000         0.142500          0.283900        0.241400             0.105200       0.259700  ...      
26.500000        98.870000   567.700000          0.209800           0.866300         0.686900              0.257500        0.663800                 0.173000\n4      20.290000     14.340000      135.100000  1297.000000         0.100300          0.132800        0.198000             0.104300       0.180900  ...      16.670000       152.200000  1575.000000          0.137400           0.205000         0.400000              0.162500        0.236400                 0.076780\n..           ...           ...             ...          ...              ...               ...             ...                  ...            ...  ...            ...              ...          ...               ...                ...              ...                   ...             ...                      ...\n709    19.478557     23.348123      128.995257  1164.950583         0.101810          0.143231        0.194792             0.095794       0.198376  ...      30.482866       143.381227  1362.533650          0.135197           0.267786         0.365230              0.170069        0.273984                 0.076077\n710    18.752895     20.824323      124.472875  1084.317645         0.096491          0.171270        0.177021             0.095356       0.204866  ...      27.544127       160.451305  1623.116663          0.133721           0.506298         0.521417              0.203921        0.348906                 0.098688\n711    17.182368     21.204540      112.271609   925.918840         0.100517          0.110961        0.110803             0.076692       0.204604  ...      28.119577       142.316398  1439.815962          0.155602           0.277795         0.388351              0.207039        0.334574                 0.080310\n712    18.285452     20.578363      120.603613  1048.317740         0.106252          0.125135        0.153635             0.093128       0.188095  ...      
26.188544       142.298194  1487.517523          0.147703           0.251890         0.365958              0.150828        0.308848                 0.078435\n713    14.550791     25.918705       96.913441   655.023273         0.111607          0.166865        0.158127             0.077468       0.228924  ...      36.072516       123.641397   930.709825          0.163673           0.659480         0.662486              0.197880        0.423041                 0.132320\n\n[714 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/balancer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformBalance the data.</p> <p></p> <p>method fit(X, y=-1)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict or sequence, default=-1 Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=-1)[source]Balance the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str or sequence, default=-1 Target column corresponding to `X`. <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>Else: Array with shape=(n_samples,) to use as target.</li> </ul> <p>Returnsdataframe Balanced dataframe. <p>series Transformed target column. </p> <p></p>"}, {"location": "API/data_cleaning/cleaner/", "title": "Cleaner", "text": "<p>class atom.data_cleaning.Cleaner(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None)[source]Applies standard data cleaning steps on a dataset.</p> <p>Use the parameters to choose which transformations to perform. The available steps are:</p> <ul> <li>Convert dtypes to the best possible types.</li> <li>Drop columns with specific data types.</li> <li>Remove characters from column names.</li> <li>Strip categorical features from spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column.</li> </ul> <p>This class can be accessed from atom through the clean method. Read more in the user guide.</p> <p>Parametersconvert_dtypes: bool, default=True Convert the column's data types to the best possible types that support <code>pd.NA</code>. <p>drop_dtypes: str, sequence or None, default=None Columns with these data types are dropped from the dataset. <p>drop_chars: str or None, default=None Remove the specified regex pattern from column names, e.g. 
<code>[^A-Za-z0-9]+</code> to remove all non-alphanumerical characters. <p>strip_categorical: bool, default=True Whether to strip spaces from categorical columns. <p>drop_duplicates: bool, default=False Whether to drop duplicate rows. Only the first occurrence of every duplicated row is kept. <p>drop_missing_target: bool, default=True Whether to drop rows with missing values in the target column. This transformation is ignored if <code>y</code> is not provided. <p>encode_target: bool, default=True Whether to encode the target column(s). This includes converting categorical columns to numerical, and binarizing multilabel columns. This transformation is ignored if <code>y</code> is not provided. <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesmissing_: list Values that are considered \"missing\". 
Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. <p>mapping_: dict Target values mapped to their respective encoded integers. Only available if encode_target=True. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>target_names_in_: np.ndarray Names of the target column(s) seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Encoder Perform encoding of categorical features.</p> <p>Discretizer Bin continuous data into intervals.</p> <p>Scaler Scale the data.</p> <p></p>"}, {"location": "API/data_cleaning/cleaner/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; y = [\"a\" if i else \"b\" for i in y]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.y)\n\n0      a\n1      a\n2      a\n3      a\n4      a\n      ..\n564    a\n565    a\n566    a\n567    a\n568    b\nName: target, Length: 569, dtype: object\n\n\n&gt;&gt;&gt; atom.clean(verbose=2)\n\nFitting Cleaner...\nCleaning the data...\n --&gt; Label-encoding column target.\n\n\n&gt;&gt;&gt; print(atom.y)\n\n0      0\n1      0\n2      0\n3      0\n4      0\n      ..\n564    0\n565    0\n566    0\n567    0\n568    1\nName: target, Length: 569, dtype: Int64\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Cleaner\n&gt;&gt;&gt; from numpy.random import randint\n\n&gt;&gt;&gt; y = [\"a\" if i else \"b\" for i in range(randint(100))]\n\n&gt;&gt;&gt; cleaner = Cleaner(verbose=2)\n&gt;&gt;&gt; y = cleaner.fit_transform(y=y)\n\nFitting Cleaner...\nCleaning the data...\n --&gt; 
Label-encoding column target.\n\n\n&gt;&gt;&gt; print(y)\n\n0     1\n1     0\n2     0\n3     0\n4     0\n5     0\n6     0\n7     0\n8     0\n9     0\n10    0\n11    0\n12    0\n13    0\n14    0\n15    0\n16    0\n17    0\n18    0\n19    0\n20    0\n21    0\n22    0\n23    0\n24    0\n25    0\n26    0\n27    0\n28    0\n29    0\n30    0\n31    0\n32    0\n33    0\n34    0\n35    0\n36    0\nName: target, dtype: Int64\n</code></pre>"}, {"location": "API/data_cleaning/cleaner/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformInversely transform the label encoding.set_paramsSet the parameters of this estimator.transformApply the data cleaning steps to the data.</p> <p></p> <p>method fit(X=None, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Inversely transform the label encoding.</p> <p>This method only inversely transforms the target encoding. The rest of the transformations can't be inverted. If <code>encode_target=False</code>, the data is returned as is.</p> <p>ParametersX: dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Unchanged feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X=None, y=None)[source]Apply the data cleaning steps to the data.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series Transformed target column. Only returned if provided. </p> <p></p>"}, {"location": "API/data_cleaning/discretizer/", "title": "Discretizer", "text": "<p>class atom.data_cleaning.Discretizer(strategy=\"quantile\", bins=5, labels=None, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None)[source]Bin continuous data into intervals.</p> <p>For each feature, the bin edges are computed during fit and, together with the number of bins, they define the intervals. Ignores categorical columns.</p> <p>This class can be accessed from atom through the discretize method. Read more in the user guide.</p> <p>Tip</p> <p>The transformation returns categorical columns. Use the Encoder class to convert them back to numerical types.</p> <p>Parametersstrategy: str, default=\"quantile\" Strategy used to define the widths of the bins. 
Choose from: <ul> <li>\"uniform\": All bins have identical widths.</li> <li>\"quantile\": All bins have the same number of points.</li> <li>\"kmeans\": Values in each bin have the same nearest center of   a 1D k-means cluster.</li> <li>\"custom\": Use custom bin edges provided through <code>bins</code>.</li> </ul> <p>bins: int, sequence or dict, default=5 Bin number or bin edges in which to split every column. <ul> <li>If int: Number of bins to produce for all columns. Only for   strategy!=\"custom\".</li> <li> <p>If sequence:</p> <ul> <li>For strategy!=\"custom\": Number of bins per column. The   n-th value corresponds to the n-th column that is   transformed. Categorical columns are ignored.</li> <li>For strategy=\"custom\": Bin edges with length=n_bins - 1.   The outermost edges are always <code>-inf</code> and <code>+inf</code>, e.g.,   bins <code>[1, 2]</code> indicate <code>(-inf, 1], (1, 2], (2, inf]</code>.</li> </ul> </li> <li> <p>If dict: One of the aforementioned options per column, where   the key is the column's name. Columns that are not in the   dictionary are not transformed.</p> </li> </ul> <p>labels: sequence, dict or None, default=None Label names with which to replace the binned intervals. <ul> <li>If None: Use default labels of the form <code>(min_edge, max_edge]</code>.</li> <li>If sequence: Labels to use for all columns.</li> <li>If dict: Labels per column, where the key is the column's name.   Columns that are not in the dictionary use the default labels.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. 
Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. Only for strategy=\"quantile\". <p>Attributesfeature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Encoder Perform encoding of categorical features.</p> <p>Imputer Handle missing values in the data.</p> <p>Normalizer Transform the data to follow a Normal/Gaussian distribution.</p> <p></p>"}, {"location": "API/data_cleaning/discretizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom[\"mean radius\"])\n\n0      13.48\n1      18.31\n2      17.93\n3      15.13\n4       8.95\n       ...  \n564    14.34\n565    13.17\n566    17.30\n567    17.68\n568    14.80\nName: mean radius, Length: 569, dtype: float64\n\n\n&gt;&gt;&gt; atom.discretize(\n...     strategy=\"custom\",\n...     bins=[13, 18],\n...     labels=[\"small\", \"medium\", \"large\"],\n...     verbose=2,\n...     
columns=\"mean radius\",\n... )\n\nFitting Discretizer...\nBinning the features...\n --&gt; Discretizing feature mean radius in 3 bins.\n\n\n&gt;&gt;&gt; print(atom[\"mean radius\"])\n\n0      medium\n1       large\n2      medium\n3      medium\n4       small\n        ...  \n564    medium\n565    medium\n566    medium\n567    medium\n568    medium\nName: mean radius, Length: 569, dtype: category\nCategories (3, object): ['small' &lt; 'medium' &lt; 'large']\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Discretizer\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; print(X[\"mean radius\"])\n\n0      17.99\n1      20.57\n2      19.69\n3      11.42\n4      20.29\n       ...  \n564    21.56\n565    20.13\n566    16.60\n567    20.60\n568     7.76\nName: mean radius, Length: 569, dtype: float64\n\n\n&gt;&gt;&gt; discretizer = Discretizer(\n...     strategy=\"custom\",\n...     bins={\"mean radius\": [13, 18]},\n...     labels=[\"small\", \"medium\", \"large\"],\n...     verbose=2,\n... )\n&gt;&gt;&gt; X = discretizer.fit_transform(X)\n\nFitting Discretizer...\nBinning the features...\n --&gt; Discretizing feature mean radius in 3 bins.\n\n\n&gt;&gt;&gt; print(X[\"mean radius\"])\n\n0      medium\n1       large\n2       large\n3       small\n4       large\n        ...  
\n564     large\n565     large\n566    medium\n567     large\n568     small\nName: mean radius, Length: 569, dtype: category\nCategories (3, object): ['small' &lt; 'medium' &lt; 'large']\n</code></pre>"}, {"location": "API/data_cleaning/discretizer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformBin the data into intervals.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Bin the data into intervals.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. 
</p> <p></p>"}, {"location": "API/data_cleaning/encoder/", "title": "Encoder", "text": "<p>class atom.data_cleaning.Encoder(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"infrequent\", n_jobs=1, verbose=0, logger=None, **kwargs)[source]Perform encoding of categorical features.</p> <p>The encoding type depends on the number of classes in the column:</p> <ul> <li>If n_classes=2 or ordinal feature, use Ordinal-encoding.</li> <li>If 2 &lt; n_classes &lt;= <code>max_onehot</code>, use OneHot-encoding.</li> <li>If n_classes &gt; <code>max_onehot</code>, use <code>strategy</code>-encoding.</li> </ul> <p>Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Infrequent classes can be replaced with a value in order to prevent too high cardinality.</p> <p>This class can be accessed from atom through the encode method. Read more in the user guide.</p> <p>Warning</p> <p>Three category-encoders estimators are unavailable:</p> <ul> <li>OneHotEncoder: Use the max_onehot parameter.</li> <li>HashingEncoder: Incompatibility of APIs.</li> <li>LeaveOneOutEncoder: Incompatibility of APIs.</li> </ul> <p>Parametersstrategy: str or estimator, default=\"Target\" Type of encoding to use for high cardinality features. Choose from any of the estimators in the category-encoders package or provide a custom one. <p>max_onehot: int or None, default=10 Maximum number of unique values in a feature to perform one-hot encoding. If None, <code>strategy</code>-encoding is always used for columns with more than two classes. <p>ordinal: dict or None, default=None Order of ordinal features, where the dict key is the feature's name and the value is the class order, e.g., <code>{\"salary\": [\"low\", \"medium\", \"high\"]}</code>. 
<p>infrequent_to_value: int, float or None, default=None Replaces infrequent class occurrences in categorical columns with the string in parameter <code>value</code>. This transformation is done before the encoding of the column. <ul> <li>If None: Skip this step.</li> <li>If int: Minimum number of occurrences in a class.</li> <li>If float: Minimum fraction of occurrences in a class.</li> </ul> <p>value: str, default=\"infrequent\" Value with which to replace rare classes. This parameter is ignored if <code>infrequent_to_value=None</code>. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores + 1 + value, e.g., -2 uses all cores but one.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. <p>Attributesmapping_: dict of dicts Encoded values and their respective mapping. The column name is the key to its mapping dictionary. Only for ordinal encoding. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. 
<p></p> <p></p> <p>See Also</p> <p>Cleaner Applies standard data cleaning steps on a dataset.</p> <p>Imputer Handle missing values in the data.</p> <p>Pruner Prune outliers from the data.</p> <p></p>"}, {"location": "API/data_cleaning/encoder/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from numpy.random import randint\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; X[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\n&gt;&gt;&gt; X[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\n&gt;&gt;&gt; X[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2  cat_feature_3\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710             x0             x1            x17\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863             x0             x0            x15\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...            
0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948             x1             x0            x16\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165             x0             x0            x13\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722             x0             x1            x11\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...               ...                ...              ...                   ...             ...                      ...            ...            ...            ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072             x0             x2            x11\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618             x1             x1             x5\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...            
0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113             x0             x1            x17\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738             x0             x0             x2\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285             x0             x2            x14\n\n[569 rows x 33 columns]\n\n\n&gt;&gt;&gt; atom.encode(strategy=\"target\", max_onehot=10, verbose=2)\n\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Ordinal-encoding feature cat_feature_1. Contains 2 classes.\n --&gt; OneHot-encoding feature cat_feature_2. Contains 3 classes.\n --&gt; Target-encoding feature cat_feature_3. Contains 20 classes.\n\n\n&gt;&gt;&gt; # Note the one-hot encoded column with name [feature]_[class]\n&gt;&gt;&gt; print(atom.X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2_x1  cat_feature_2_x0  cat_feature_2_x2  cat_feature_3\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           
0.5030               0.22580          0.2807                  0.10710            0.0               1.0               0.0               0.0       0.622917\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           0.3759               0.15100          0.3074                  0.07863            0.0               0.0               1.0               0.0       0.619953\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           0.2080               0.11360          0.2504                  0.07948            1.0               0.0               1.0               0.0       0.636924\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           0.1547               0.06575          0.3233                  0.06165            0.0               0.0               1.0               0.0       0.585368\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...           0.1544               0.03846          0.1652                  0.07722            0.0               1.0               0.0               0.0       0.638596\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...                   ...             ...                      ...            ...               ...               ...               ...            ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           
0.1632               0.10870          0.3062                  0.06072            0.0               0.0               0.0               1.0       0.638596\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           0.3728               0.16070          0.3693                  0.09618            1.0               1.0               0.0               0.0       0.588596\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           0.3378               0.18570          0.3138                  0.08113            0.0               1.0               0.0               0.0       0.622917\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           0.3583               0.15150          0.2463                  0.07738            0.0               0.0               1.0               0.0       0.688596\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           
0.2060               0.08308          0.3600                  0.07285            0.0               0.0               0.0               1.0       0.662643\n\n[569 rows x 35 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Encoder\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from numpy.random import randint\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; X[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\n&gt;&gt;&gt; X[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\n&gt;&gt;&gt; X[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2  cat_feature_3\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890             x1             x2             x5\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902             x1             x2            x13\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           
0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758             x0             x0            x15\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300             x0             x2            x10\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678             x1             x1            x17\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...               ...                ...              ...                   ...             ...                      ...            ...            ...            ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115             x1             x1            x12\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637             x0             x2            x14\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           
0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820             x0             x1             x3\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400             x1             x0             x2\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039             x1             x1            x11\n\n[569 rows x 33 columns]\n\n\n&gt;&gt;&gt; encoder = Encoder(strategy=\"target\", max_onehot=10, verbose=2)\n&gt;&gt;&gt; X = encoder.fit_transform(X, y)\n\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Ordinal-encoding feature cat_feature_1. Contains 2 classes.\n --&gt; OneHot-encoding feature cat_feature_2. Contains 3 classes.\n --&gt; Target-encoding feature cat_feature_3. Contains 20 classes.\n\n\n&gt;&gt;&gt; # Note the one-hot encoded column with name [feature]_[class]\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2_x2  cat_feature_2_x0  cat_feature_2_x1  cat_feature_3\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           
0.7119                0.2654          0.4601                  0.11890            1.0               1.0               0.0               0.0       0.645086\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           0.2416                0.1860          0.2750                  0.08902            1.0               1.0               0.0               0.0       0.604148\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           0.4504                0.2430          0.3613                  0.08758            0.0               0.0               1.0               0.0       0.675079\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...           0.6869                0.2575          0.6638                  0.17300            0.0               1.0               0.0               0.0       0.706297\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           0.4000                0.1625          0.2364                  0.07678            1.0               0.0               0.0               1.0       0.716566\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...                   ...             ...                      ...            ...               ...               ...               ...            ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           
0.4107                0.2216          0.2060                  0.07115            1.0               0.0               0.0               1.0       0.598024\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           0.3215                0.1628          0.2572                  0.06637            0.0               1.0               0.0               0.0       0.683185\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           0.3403                0.1418          0.2218                  0.07820            0.0               0.0               0.0               1.0       0.472908\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           0.9387                0.2650          0.4087                  0.12400            1.0               0.0               1.0               0.0       0.585452\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...           0.0000                0.0000          0.2871                  0.07039            1.0               0.0               0.0               1.0       0.516759\n\n[569 rows x 35 columns]\n</code></pre>"}, {"location": "API/data_cleaning/encoder/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformEncode the data.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>Note that leaving y=None can lead to errors if the <code>strategy</code> encoder requires target values. 
For multioutput tasks, only the first target column is used to fit the encoder.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict, sequence or dataframe-like Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Encode the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Encoded dataframe. </p> <p></p>"}, {"location": "API/data_cleaning/imputer/", "title": "Imputer", "text": "<p>class atom.data_cleaning.Imputer(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None)[source]Handle missing values in the data.</p> <p>Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. 
Use the <code>missing</code> attribute to customize what are considered \"missing values\".</p> <p>This class can be accessed from atom through the impute method. Read more in the user guide.</p> <p>Parametersstrat_num: str, int or float, default=\"drop\" Imputing strategy for numerical columns. Choose from: <ul> <li>\"drop\": Drop rows containing missing values.</li> <li>\"mean\": Impute with mean of column.</li> <li>\"median\": Impute with median of column.</li> <li>\"knn\": Impute using a K-Nearest Neighbors approach.</li> <li>\"iterative\": Impute using a multivariate imputer.</li> <li>\"most_frequent\": Impute with the most frequent value.</li> <li>int or float: Impute with provided numerical value.</li> </ul> <p>strat_cat: str, default=\"drop\" Imputing strategy for categorical columns. Choose from: <ul> <li>\"drop\": Drop rows containing missing values.</li> <li>\"most_frequent\": Impute with the most frequent value.</li> <li>str: Impute with provided string.</li> </ul> <p>max_nan_rows: int, float or None, default=None Maximum number or fraction of missing values in a row (if more, the row is removed). If None, ignore this step. <p>max_nan_cols: int, float or None, default=None Maximum number or fraction of missing values in a column (if more, the column is removed). If None, ignore this step. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 - value.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. 
The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. Only used when strat_num=\"iterative\". <p>Attributesmissing_: list Values that are considered \"missing\". Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. 
<p></p> <p></p> <p>See Also</p> <p>Balancer Balance the number of samples per class in the target column.</p> <p>Discretizer Bin continuous data into intervals.</p> <p>Encoder Perform encoding of categorical features.</p> <p></p>"}, {"location": "API/data_cleaning/imputer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from numpy.random import randint\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add some random missing values to the data\n&gt;&gt;&gt; for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n...     X.iat[i, j] = np.NaN\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.nans)\n\nmean radius                130\nmean texture               141\nmean perimeter             124\nmean area                  136\nmean smoothness              0\nmean compactness             0\nmean concavity               0\nmean concave points          0\nmean symmetry                0\nmean fractal dimension       0\nradius error                 0\ntexture error                0\nperimeter error              0\narea error                   0\nsmoothness error             0\ncompactness error            0\nconcavity error              0\nconcave points error         0\nsymmetry error               0\nfractal dimension error      0\nworst radius                 0\nworst texture                0\nworst perimeter              0\nworst area                   0\nworst smoothness             0\nworst compactness            0\nworst concavity              0\nworst concave points         0\nworst symmetry               0\nworst fractal dimension      0\ndtype: int64\n\n\n&gt;&gt;&gt; atom.impute(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n\nFitting Imputer...\nImputing missing values...\n --&gt; Imputing 130 missing 
values with median (13.27) in feature mean radius.\n --&gt; Imputing 141 missing values with median (18.87) in feature mean texture.\n --&gt; Imputing 124 missing values with median (85.66) in feature mean perimeter.\n --&gt; Imputing 136 missing values with median (555.1) in feature mean area.\n\n\n&gt;&gt;&gt; print(atom.n_nans)\n\n0\n</code></pre> <pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom.data_cleaning import Imputer\n&gt;&gt;&gt; from numpy.random import randint\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add some random missing values to the data\n&gt;&gt;&gt; for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n...     X.iloc[i, j] = np.nan\n\n&gt;&gt;&gt; imputer = Imputer(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n&gt;&gt;&gt; X, y = imputer.fit_transform(X, y)\n\nFitting Imputer...\nImputing missing values...\n --&gt; Dropping 2 samples for containing more than 3 missing values.\n --&gt; Imputing 124 missing values with median (13.38) in feature mean radius.\n --&gt; Imputing 127 missing values with median (18.87) in feature mean texture.\n --&gt; Imputing 137 missing values with median (86.54) in feature mean perimeter.\n --&gt; Imputing 134 missing values with median (561.3) in feature mean area.\n\n\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0          13.38        10.380         122.800     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...          
17.33           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890\n1          20.57        17.770          86.545      561.3          0.08474           0.07864         0.08690              0.07017         0.1812  ...          23.41           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902\n2          19.69        21.250         130.000     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...          25.53           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758\n3          11.42        20.380          77.580      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...          26.50            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300\n4          13.38        14.340         135.100     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...          16.67           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564        21.56        22.390          86.545      561.3          0.11100           0.11590         0.24390              0.13890         0.1726  ...          
26.40           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115\n565        20.13        18.865         131.200     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...          38.25           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637\n566        13.38        28.080          86.545      561.3          0.08455           0.10230         0.09251              0.05302         0.1590  ...          34.12           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820\n567        20.60        29.330         140.100     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...          39.42           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400\n568        13.38        24.540          47.920      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...          30.37            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039\n\n[567 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/imputer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformImpute the missing values.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
<p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Impute the missing values.</p> <p>Note that leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Imputed dataframe. <p>series Transformed target column. Only returned if provided. 
</p> <p></p>"}, {"location": "API/data_cleaning/normalizer/", "title": "Normalizer", "text": "<p>class atom.data_cleaning.Normalizer(strategy=\"yeojohnson\", device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None, **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.</p> <p>This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Categorical columns are ignored.</p> <p>This class can be accessed from atom through the normalize method. Read more in the user guide.</p> <p>Warning</p> <p>The quantile strategy performs a non-linear transformation. This may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable.</p> <p>Note</p> <p>The yeojohnson and boxcox strategies scale the data after transforming. Use the <code>kwargs</code> to change this behavior.</p> <p>Parametersstrategy: str, default=\"yeojohnson\" The transforming strategy. Choose from: <ul> <li>\"yeojohnson\"</li> <li>\"boxcox\" (only works with strictly positive values)</li> <li>\"quantile\": Transform features using quantiles information.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. 
Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the quantile strategy. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. <p>Attributes[strategy]_: sklearn transformer Object with which the data is transformed, e.g., <code>normalizer.yeojohnson</code> for the default strategy. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Cleaner Applies standard data cleaning steps on a dataset.</p> <p>Pruner Prune outliers from the data.</p> <p>Scaler Scale the data.</p> <p></p>"}, {"location": "API/data_cleaning/normalizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     
...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.plot_distribution(columns=0)\n</code></pre> <pre><code>&gt;&gt;&gt; atom.normalize(verbose=2)\n\nFitting Normalizer...\nNormalizing features...\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      -0.017068      0.464087        0.031104  -0.020222         0.390628          0.620790        0.562136             0.426774      -0.280554  ...         0.251532    0.081524          1.224389           1.206519         1.189835              1.522769       -0.043007                 1.378960       0\n1       1.182066      0.411242        1.183030   1.200556         0.741209          0.608244        1.100342             1.256472       0.256014  ...         1.119375    1.218096          0.759546           0.244492         0.726989              0.650523        0.424017                -0.164104       0\n2       1.105309      1.197684        1.018344   1.106437        -0.552214         -0.652544       -0.230044             0.226950      -1.050816  ...         0.973194    1.037232          0.002307          -0.374986        -0.128679              0.107299       -0.647198                -0.100126       0\n3       0.455144      2.077941        0.379512   0.486019        -0.966587         -1.447057       -0.438308            -0.480189       0.226570  ...         0.337722    0.483003         -0.785100          -1.301043        -0.483292             -0.722786        0.676588                -1.783846       0\n4      -1.898537     -0.815757       -1.745528  -1.873415        -0.102067          0.599235        0.374346            -0.662103      -2.173761  ...        -1.869111   -2.095123         -0.633206          -0.305478        -0.485431             -1.278472       -2.898859                -0.273347       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     
...\n564     0.238929     -1.546154        0.209113   0.257899         0.214334         -0.482480       -0.225132             0.183841       0.996371  ...         0.346743    0.373205         -0.079012          -0.660736        -0.423384              0.029761        0.404215                -1.894769       1\n565    -0.115233      0.675396       -0.105672  -0.125511         0.078814          0.213069        0.222118             0.375009      -0.177404  ...         0.194134    0.082260          0.804177           1.061384         0.714032              0.778530        1.315113                 0.913117       0\n566     0.972621     -0.443853        0.950416   0.971288         0.335466          0.200161        0.804757             1.074782       0.080964  ...         0.880583    0.920102          0.443592           0.144776         0.561298              1.086695        0.527842                 0.020173       0\n567     1.053489      0.446545        1.084407   1.040647         1.046541          1.237987        1.321388             1.410770       0.650180  ...         0.925288    1.016604          0.452080           0.855688         0.652219              0.657243       -0.735710                -0.260751       0\n568     0.366875     -0.289945        0.346701   0.359700        -0.309357         -0.150999       -0.574459            -0.683107       0.375972  ...         
0.207028    0.284140         -0.407994          -0.303600        -0.141124             -0.402554        1.196110                -0.638106       1\n\n[569 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.plot_distribution(columns=0)\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Normalizer\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; normalizer = Normalizer(verbose=2)\n&gt;&gt;&gt; X = normalizer.fit_transform(X)\n\nFitting Normalizer...\nNormalizing features...\n\n\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.134881     -2.678666        1.259822   1.126421         1.504114          2.165938        1.862988             1.848558       1.953067  ...      -1.488367         1.810506    1.652210          1.282792           1.942737         1.730182              1.935654        2.197206                 1.723624\n1       1.619346     -0.264377        1.528723   1.633946        -0.820227         -0.384102        0.291976             0.820609       0.102291  ...      -0.288382         1.430616    1.610022         -0.325080          -0.296580         0.070746              1.101594       -0.121997                 0.537179\n2       1.464796      0.547806        1.454664   1.461645         0.963977          1.163977        1.403673             1.683104       0.985668  ...       
0.071406         1.321941    1.425307          0.580301           1.209701         1.005512              1.722744        1.218181                 0.453955\n3      -0.759262      0.357721       -0.514886  -0.836238         2.781494          2.197843        1.642391             1.423004       2.360528  ...       0.228089        -0.039480   -0.436860          2.857821           2.282276         1.675087              1.862378        3.250202                 2.517606\n4       1.571260     -1.233520        1.583340   1.595120         0.343932          0.762392        1.407479             1.410929       0.090964  ...      -1.637882         1.316582    1.309486          0.284367          -0.131829         0.817474              0.807077       -0.943554                -0.279402\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     1.781795      0.785604        1.746492   1.823030         1.052829          0.460810        1.653784             1.783067      -0.232645  ...       0.212151         1.547961    1.657442          0.438013          -0.077871         0.859079              1.503734       -1.721528                -0.751459\n565     1.543335      1.845150        1.485601   1.545430         0.168014          0.207602        0.984746             1.320730      -0.129120  ...       1.832201         1.365939    1.443167         -0.667317          -0.245277         0.480804              0.810995       -0.480093                -1.210527\n566     0.828589      1.817618        0.811329   0.835270        -0.835509          0.183969        0.375105             0.396882      -0.808189  ...       
1.320625         0.786129    0.796192         -0.799337           0.626487         0.566826              0.526136       -1.301164                -0.170872\n567     1.624440      2.016299        1.702747   1.551036         1.468642          2.162820        1.994466             1.884414       1.899087  ...       1.968949         1.810506    1.513198          1.387135           2.284642         2.136932              1.931990        1.744693                 1.850944\n568    -2.699432      1.203224       -2.827766  -2.703256        -3.834325         -1.481409       -1.658319            -1.845392      -0.821560  ...       0.810681        -2.231436   -2.149403         -2.064647          -1.731936        -1.819966             -2.131070        0.103122                -0.820663\n\n[569 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/normalizer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.set_paramsSet the parameters of this estimator.transformApply the transformations to the data.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X, y=None)[source]Apply the inverse transformation to the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Original dataframe. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Apply the transformations to the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Normalized dataframe. 
</p> <p></p>"}, {"location": "API/data_cleaning/pruner/", "title": "Pruner", "text": "<p>class atom.data_cleaning.Pruner(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Prune outliers from the data.</p> <p>Replace or remove outliers. The definition of an outlier depends on the selected strategy, and the definitions can differ greatly from one another. Ignores categorical columns.</p> <p>This class can be accessed from atom through the prune method. Read more in the user guide.</p> <p>Info</p> <p>The \"sklearnex\" and \"cuml\" engines are only supported for strategy=\"dbscan\".</p> <p>Parametersstrategy: str or sequence, default=\"zscore\" Strategy with which to select the outliers. If sequence of strategies, only samples marked as outliers by all chosen strategies are dropped. Choose from: <ul> <li>\"zscore\": Z-score of each data value.</li> <li>\"iforest\": Isolation Forest.</li> <li>\"ee\": Elliptic Envelope.</li> <li>\"lof\": Local Outlier Factor.</li> <li>\"svm\": One-class SVM.</li> <li>\"dbscan\": Density-Based Spatial Clustering.</li> <li>\"hdbscan\": Hierarchical Density-Based Spatial Clustering.</li> <li>\"optics\": DBSCAN-like clustering approach.</li> </ul> <p>method: int, float or str, default=\"drop\" Method to apply on the outliers. Only the zscore strategy accepts a method other than \"drop\". Choose from: <ul> <li>\"drop\": Drop any sample with outlier values.</li> <li>\"minmax\": Replace outlier with the min/max of the column.</li> <li>Any numerical value with which to replace the outliers.</li> </ul> <p>max_sigma: int or float, default=3 Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. Only if strategy=\"zscore\". <p>include_target: bool, default=False Whether to include the target column in the search for outliers. This can be useful for regression tasks. 
Only if strategy=\"zscore\". <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. If sequence of strategies, the params should be provided in a dict with the strategy's name as key. <p>Attributes[strategy]_: sklearn estimator Object used to prune the data, e.g., <code>pruner.iforest</code> for the isolation forest strategy. Not available for strategy=\"zscore\". <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. 
<p></p> <p></p> <p>See Also</p> <p>Balancer Balance the number of samples per class in the target column.</p> <p>Normalizer Transform the data to follow a Normal/Gaussian distribution.</p> <p>Scaler Scale the data.</p> <p></p>"}, {"location": "API/data_cleaning/pruner/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           
110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           
132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.prune(strategy=\"iforest\", verbose=2)\n\nFitting Pruner...\nPruning outliers...\n --&gt; Dropping 63 outliers.\n\n\n&gt;&gt;&gt; # Note the reduced number of rows\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           
135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4          10.26         16.58           65.85      320.8          0.08877           0.08066         0.04358              0.02438         0.1669  ...            71.08       357.4            0.1461            0.22460           0.1783               0.08333          0.2691                  0.09479       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n501        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n502        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n503        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           
130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n504        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n505        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[506 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.plot_distribution(columns=0)\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Normalizer\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; normalizer = Normalizer(verbose=2)\n&gt;&gt;&gt; X = normalizer.fit_transform(X)\n\nFitting Normalizer...\nNormalizing features...\n\n\n&gt;&gt;&gt; # Note the reduced number of rows\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.134881     -2.678666        1.259822   1.126421         1.504114          2.165938        1.862988             1.848558       1.953067  ...      
-1.488367         1.810506    1.652210          1.282792           1.942737         1.730182              1.935654        2.197206                 1.723624\n1       1.619346     -0.264377        1.528723   1.633946        -0.820227         -0.384102        0.291976             0.820609       0.102291  ...      -0.288382         1.430616    1.610022         -0.325080          -0.296580         0.070746              1.101594       -0.121997                 0.537179\n2       1.464796      0.547806        1.454664   1.461645         0.963977          1.163977        1.403673             1.683104       0.985668  ...       0.071406         1.321941    1.425307          0.580301           1.209701         1.005512              1.722744        1.218181                 0.453955\n3      -0.759262      0.357721       -0.514886  -0.836238         2.781494          2.197843        1.642391             1.423004       2.360528  ...       0.228089        -0.039480   -0.436860          2.857821           2.282276         1.675087              1.862378        3.250202                 2.517606\n4       1.571260     -1.233520        1.583340   1.595120         0.343932          0.762392        1.407479             1.410929       0.090964  ...      -1.637882         1.316582    1.309486          0.284367          -0.131829         0.817474              0.807077       -0.943554                -0.279402\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     1.781795      0.785604        1.746492   1.823030         1.052829          0.460810        1.653784             1.783067      -0.232645  ...       
0.212151         1.547961    1.657442          0.438013          -0.077871         0.859079              1.503734       -1.721528                -0.751459\n565     1.543335      1.845150        1.485601   1.545430         0.168014          0.207602        0.984746             1.320730      -0.129120  ...       1.832201         1.365939    1.443167         -0.667317          -0.245277         0.480804              0.810995       -0.480093                -1.210527\n566     0.828589      1.817618        0.811329   0.835270        -0.835509          0.183969        0.375105             0.396882      -0.808189  ...       1.320625         0.786129    0.796192         -0.799337           0.626487         0.566826              0.526136       -1.301164                -0.170872\n567     1.624440      2.016299        1.702747   1.551036         1.468642          2.162820        1.994466             1.884414       1.899087  ...       1.968949         1.810506    1.513198          1.387135           2.284642         2.136932              1.931990        1.744693                 1.850944\n568    -2.699432      1.203224       -2.827766  -2.703256        -3.834325         -1.481409       -1.658319            -1.845392      -0.821560  ...       0.810681        -2.231436   -2.149403         -2.064647          -1.731936        -1.819966             -2.131070        0.103122                -0.820663\n\n[569 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/pruner/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformApply the outlier strategy on the data.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). 
If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Apply the outlier strategy on the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Transformed feature set. <p>series Transformed target column. Only returned if provided. 
</p> <p></p>"}, {"location": "API/data_cleaning/scaler/", "title": "Scaler", "text": "<p>class atom.data_cleaning.Scaler(strategy=\"standard\", include_binary=False, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Scale the data.</p> <p>Apply one of sklearn's scalers. Categorical columns are ignored.</p> <p>This class can be accessed from atom through the scale method. Read more in the user guide.</p> <p>Parametersstrategy: str, default=\"standard\" Strategy with which to scale the data. Choose from: <ul> <li>\"standard\": Remove mean and scale to unit variance.</li> <li>\"minmax\": Scale features to a given range.</li> <li>\"maxabs\": Scale features by their maximum absolute value.</li> <li>\"robust\": Scale using statistics that are robust to outliers.</li> </ul> <p>include_binary: bool, default=False Whether to scale binary columns (only 0s and 1s). <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. <p>Attributes[strategy]_: sklearn transformer Object with which the data is scaled, e.g., <code>scaler.standard</code> for the default strategy. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Balancer Balance the number of samples per class in the target column.</p> <p>Normalizer Transform the data to follow a Normal/Gaussian distribution.</p> <p>Scaler Scale the data.</p> <p></p>"}, {"location": "API/data_cleaning/scaler/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           
142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           
105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.scale(verbose=2)\n\nFitting Scaler...\nScaling features...\n\n\n&gt;&gt;&gt; # Note the scaled feature values\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      -0.181875      0.356669       -0.147122  -0.270991         0.340268          0.381628        0.214571             0.125567      -0.345050  ...         
0.000933   -0.246244          1.240292           1.077359         1.116229              1.667157       -0.162964                 1.326816       0\n1       1.162216      0.300578        1.159704   1.097856         0.707625          0.368288        0.852572             1.148598       0.172744  ...         1.025723    1.042996          0.719898          -0.011475         0.500961              0.537309        0.280594                -0.308640       0\n2       1.056470      1.212060        0.933833   0.950360        -0.581659         -0.670877       -0.407166            -0.051653      -1.018183  ...         0.817241    0.746639         -0.060694          -0.482078        -0.311813             -0.027615       -0.666328                -0.259812       0\n3       0.277287      2.457753        0.188054   0.174273        -0.959614         -1.132432       -0.534892            -0.562913       0.143156  ...         0.083151    0.080948         -0.797185          -1.010314        -0.569828             -0.750385        0.544735                -1.284055       0\n4      -1.442482     -0.825921       -1.343434  -1.143186        -0.152840          0.358760        0.042209            -0.672815      -1.879941  ...        -1.289891   -1.052061         -0.660471          -0.435018        -0.571280             -1.162598       -2.081728                -0.389638       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564     0.057446     -1.361124        0.018651  -0.043220         0.160827         -0.557108       -0.404013            -0.087607       0.967929  ...         
0.091960   -0.018751         -0.140077          -0.663228        -0.528681             -0.101629        0.260659                -1.337478       1\n565    -0.268141      0.588045       -0.267318  -0.347933         0.025188         -0.014753       -0.084382             0.077883      -0.248889  ...        -0.051921   -0.245730          0.768409           0.870422         0.485954              0.683827        1.308918                 0.699518       0\n566     0.881154     -0.517419        0.845098   0.753978         0.283751         -0.026187        0.470528             0.868616      -0.001087  ...         0.693914    0.578760          0.384728          -0.095926         0.316526              1.061450        0.386915                -0.165028       0\n567     0.986900      0.337972        1.022568   0.852586         1.039660          1.162956        1.213182             1.426285       0.583281  ...         0.752641    0.715804          0.393548           0.608690         0.415763              0.544861       -0.734440                -0.380446       0\n568     0.185455     -0.381865        0.154577   0.050111        -0.352767         -0.315850       -0.612688            -0.685055       0.294796  ...        -0.040176   -0.093611         -0.453195          -0.433728        -0.321494             -0.488617        1.154420                -0.640672       1\n\n[569 rows x 31 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Scaler\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; scaler = Scaler(verbose=2)\n&gt;&gt;&gt; X = scaler.fit_transform(X)\n\nFitting Scaler...\nScaling features...\n\n\n&gt;&gt;&gt; # Note the scaled feature values\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.097064     -2.073335        1.269934   0.984375         1.568466          3.283515        2.652874             2.532475       2.217515  ...      -1.359293         2.303601    2.001237          1.307686           2.616665         2.109526              2.296076        2.750622                 1.937015\n1       1.829821     -0.353632        1.685955   1.908708        -0.826962         -0.487072       -0.023846             0.548144       0.001392  ...      -0.369203         1.535126    1.890489         -0.375612          -0.430444        -0.146749              1.087084       -0.243890                 0.281190\n2       1.579888      0.456187        1.566503   1.558884         0.942210          1.052926        1.363478             2.037231       0.939685  ...      -0.023974         1.347475    1.456285          0.527407           1.082932         0.854974              1.955000        1.152255                 0.201391\n3      -0.768909      0.253732       -0.592687  -0.764464         3.283553          3.402909        1.915897             1.451707       2.867383  ...       0.133984        -0.249939   -0.550021          3.394275           3.893397         1.989588              2.175786        6.046041                 4.935010\n4       1.750297     -1.151816        1.776573   1.826229         0.280372          0.539340        1.371011             1.428493      -0.009560  ...      -1.466770         1.338539    1.220724          0.220556          -0.313395         0.613179              0.729259       -0.868353                -0.397100\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...          
   ...                      ...\n564     2.110995      0.721473        2.060786   2.343856         1.041842          0.219060        1.947285             2.320965      -0.312589  ...       0.117700         1.752563    2.015301          0.378365          -0.273318         0.664512              1.629151       -1.360158                -0.709091\n565     1.704854      2.085134        1.615931   1.723842         0.102458         -0.017833        0.693043             1.263669      -0.217664  ...       2.047399         1.421940    1.494959         -0.691230          -0.394820         0.236573              0.733827       -0.531855                -0.973978\n566     0.702284      2.045574        0.672676   0.577953        -0.840484         -0.038680        0.046588             0.105777      -0.809117  ...       1.374854         0.579001    0.427906         -0.809587           0.350735         0.326767              0.414069       -1.104549                -0.318409\n567     1.838341      2.336457        1.982524   1.735218         1.525767          3.272144        3.296944             2.658866       2.137194  ...       2.237926         2.303601    1.653171          1.430427           3.904848         3.197605              2.289985        1.919083                 2.219635\n568    -1.808401      1.221792       -1.814389  -1.347789        -3.112085         -1.150752       -1.114873            -1.261820      -0.820070  ...       
0.764190        -1.432735   -1.075813         -1.859019          -1.207552        -1.305831             -1.745063       -0.048138                -0.751207\n\n[569 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/scaler/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.set_paramsSet the parameters of this estimator.transformPerform standardization by centering and scaling.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X, y=None)[source]Apply the inverse transformation to the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Scaled dataframe. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Perform standardization by centering and scaling.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Scaled dataframe. </p> <p></p>"}, {"location": "API/feature_engineering/featureextractor/", "title": "FeatureExtractor", "text": "<p>class atom.feature_engineering.FeatureExtractor(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, verbose=0, logger=None)[source]Extract features from datetime columns.</p> <p>Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype <code>datetime64</code> are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.</p> <p>This class can be accessed from atom through the feature_extraction method. 
Read more in the user guide.</p> <p>Warning</p> <p>Decision tree-based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features since the sin/cos features should be considered together as a single coordinate system.</p> <p>Parametersfeatures: str or sequence, default=(\"day\", \"month\", \"year\") Features to create from the datetime columns. Note that created features with zero variance (e.g., the feature hour in a column that only contains dates) are ignored. Allowed values are datetime attributes from <code>pandas.Series.dt</code>. <p>fmt: str, sequence or None, default=None Format (<code>strptime</code>) of the categorical columns that need to be converted to datetime. If sequence, the n-th format corresponds to the n-th categorical column that can be successfully converted. If None, the format is inferred automatically from the first non-NaN value. Values that cannot be converted are returned as <code>NaT</code>. <p>encoding_type: str, default=\"ordinal\" Type of encoding to use. Choose from: <ul> <li>\"ordinal\": Encode features in increasing order.</li> <li>\"cyclic\": Encode features using sine and cosine to capture   their cyclic nature. This approach creates two columns for   every feature. Non-cyclic features still use ordinal encoding.</li> </ul> <p>drop_columns: bool, default=True Whether to drop the original columns after transformation. <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesfeature_names_in_: np.ndarray Names of features seen during fit. 
<p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>FeatureGenerator Generate new features.</p> <p>FeatureGrouper Extract statistics from similar features.</p> <p>FeatureSelector Reduce the number of features in the data.</p> <p></p>"}, {"location": "API/feature_engineering/featureextractor/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; import pandas as pd\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add a datetime column\n&gt;&gt;&gt; X[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.feature_extraction(features=[\"day\"], fmt=\"%d/%m/%Y\", verbose=2)\n\nFitting FeatureExtractor...\nExtracting datetime features...\n --&gt; Extracting features from column date.\n   --&gt; Creating feature date_day.\n\n\n&gt;&gt;&gt; # Note the date_day column\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  date_day  target\n0         12.770         22.47           81.72      506.3          0.09055           0.05761         0.04711              0.02704         0.1585  ...       653.6            0.1419             0.1523           0.2177               0.09331          0.2829                  0.08067        16       0\n1         27.420         26.27          186.90     2501.0          0.10840           0.19880         0.36350              0.16890         0.2061  ...      
4254.0            0.1357             0.4256           0.6833               0.26250          0.2641                  0.07427         7       0\n2         15.850         23.95          103.70      782.7          0.08401           0.10020         0.09938              0.05364         0.1847  ...       876.5            0.1131             0.1924           0.2322               0.11190          0.2809                  0.06287        14       0\n3         14.190         23.81           92.87      610.7          0.09463           0.13060         0.11150              0.06462         0.2235  ...       811.3            0.1559             0.4059           0.3744               0.17720          0.4724                  0.10260         3       0\n4          8.950         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...       270.0            0.1179             0.1879           0.1544               0.03846          0.1652                  0.07722        27       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...         ...               ...                ...              ...                   ...             ...                      ...       ...     ...\n564       10.800         21.98           68.79      359.9          0.08801           0.05743         0.03614              0.01404         0.2016  ...       489.5            0.1303             0.1696           0.1927               0.07485          0.2965                  0.07662         4       1\n565       11.930         10.91           76.14      442.7          0.08872           0.05242         0.02606              0.01796         0.1601  ...       
589.5            0.1374             0.1575           0.1514               0.06876          0.2460                  0.07262         6       1\n566       24.630         21.60          165.50     1841.0          0.10300           0.21060         0.23100              0.14710         0.1991  ...      2642.0            0.1342             0.4188           0.4658               0.24750          0.3157                  0.09671         6       0\n567        6.981         13.43           43.79      143.5          0.11700           0.07568         0.00000              0.00000         0.1930  ...       185.2            0.1584             0.1202           0.0000               0.00000          0.2932                  0.09382        12       1\n568       15.050         19.07           97.26      701.9          0.09215           0.08597         0.07486              0.04335         0.1561  ...       967.0            0.1246             0.2101           0.2866               0.11200          0.2282                  0.06954        30       0\n\n[569 rows x 32 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; import pandas as pd\n&gt;&gt;&gt; from atom.feature_engineering import FeatureExtractor\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add a datetime column\n&gt;&gt;&gt; X[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\n&gt;&gt;&gt; fe = FeatureExtractor(features=[\"day\"], fmt=\"%Y-%m-%d\", verbose=2)\n&gt;&gt;&gt; X = fe.transform(X)\n\nExtracting datetime features...\n --&gt; Extracting features from column date.\n   --&gt; Creating feature date_day.\n\n\n&gt;&gt;&gt; # Note the date_day column\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  date_day\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890         1\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902         2\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758         3\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300         4\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678         5\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...       
...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115        19\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637        20\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820        21\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400        22\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...            
59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039        23\n\n[569 rows x 31 columns]\n</code></pre>"}, {"location": "API/feature_engineering/featureextractor/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformExtract the new features.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Extract the new features.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. </p> <p></p>"}, {"location": "API/feature_engineering/featuregenerator/", "title": "FeatureGenerator", "text": "<p>class atom.feature_engineering.FeatureGenerator(strategy=\"dfs\", n_features=None, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Generate new features.</p> <p>Create new combinations of existing features to capture the non-linear relations between the original features.</p> <p>This class can be accessed from atom through the feature_generation method. Read more in the user guide.</p> <p>Warning</p> <ul> <li>Using the <code>div</code>, <code>log</code> or <code>sqrt</code> operators can return new   features with <code>inf</code> or <code>NaN</code> values. Check the warnings that   may pop up or use atom's nans attribute.</li> <li>When using dfs with <code>n_jobs&gt;1</code>, make sure to protect your code   with <code>if __name__ == \"__main__\"</code>. Featuretools uses   dask, which uses python multiprocessing   for parallelization. The spawn method on multiprocessing   starts a new python process, which requires it to import the   __main__ module before it can do its task.</li> <li>gfg can be slow for very large populations.</li> </ul> <p>Tip</p> <p>dfs can create many new features and not all of them will be useful. Use the FeatureSelector class to reduce the number of features.</p> <p>Parametersstrategy: str, default=\"dfs\" Strategy to create new features. 
Choose from: <ul> <li>\"dfs\": Deep Feature Synthesis.</li> <li>\"gfg\": Genetic Feature Generation.</li> </ul> <p>n_features: int or None, default=None Maximum number of newly generated features to add to the dataset. If None, select all created features. <p>operators: str, sequence or None, default=None Mathematical operators to apply on the features. None to use all. Choose from: <code>add</code>, <code>sub</code>, <code>mul</code>, <code>div</code>, <code>abs</code>, <code>sqrt</code>, <code>log</code>, <code>inv</code>, <code>sin</code>, <code>cos</code>, <code>tan</code>. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p>**kwargs Additional keyword arguments for the SymbolicTransformer instance. Only for the gfg strategy. <p>Attributesgfg_: SymbolicTransformer Object used to calculate the genetic features. Only available when strategy=\"gfg\". <p>genetic_features_: pd.DataFrame Information on the newly created non-linear features. Only available when strategy=\"gfg\". 
Columns include: <ul> <li>name: Name of the feature (generated automatically).</li> <li>description: Operators used to create this feature.</li> <li>fitness: Fitness score.</li> </ul> <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>FeatureExtractor Extract features from datetime columns.</p> <p>FeatureGrouper Extract statistics from similar features.</p> <p>FeatureSelector Reduce the number of features in the data.</p> <p></p>"}, {"location": "API/feature_engineering/featuregenerator/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.feature_generation(strategy=\"dfs\", n_features=5, verbose=2)\n\nFitting FeatureGenerator...\nGenerating new features...\n --&gt; 5 new features were added.\n\n\n&gt;&gt;&gt; # Note the texture error / worst symmetry column\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  ...  mean concave points * smoothness error  mean concavity + worst radius  mean radius / smoothness error  worst concave points * worst radius  worst radius / concave points error  target\n0         13.280         13.72           85.79      541.8          0.08363           0.08575  ...                                0.000122                       14.29077                     3109.342074                             1.306235                          1681.624941       1\n1         15.460         11.89          102.50      736.9          0.12570           0.15550  ...                                
0.000592                       18.99320                     2866.679028                             3.432933                          1423.484848       0\n2         13.110         15.56           87.21      530.2          0.13980           0.17650  ...                                0.000688                       16.51710                     1830.494275                             3.239166                          1175.072046       0\n3          9.847         15.68           63.00      293.2          0.09492           0.08419  ...                                0.000211                       11.26330                     1127.691251                             0.733747                          1652.698133       1\n4         14.870         20.21           96.12      680.9          0.09587           0.08345  ...                                0.000268                       16.07824                     2746.075716                             1.628217                          1353.338969       1\n..           ...           ...             ...        ...              ...               ...  ...                                     ...                            ...                             ...                                  ...                                  ...     ...\n564       14.470         24.99           95.81      656.4          0.08837           0.12300  ...                                0.000278                       16.32090                     2027.178481                             1.954510                          1395.869191       1\n565       19.690         21.25          130.00     1203.0          0.10960           0.15990  ...                                0.000787                       23.76740                     3201.626016                             5.727510                          1145.286686       0\n566       19.270         26.47          127.90     1162.0          0.09401           0.17190  ...                                
0.000381                       24.31570                     3842.472582                             4.310775                          2504.407342       0\n567       11.760         18.14           75.00      431.1          0.09968           0.05914  ...                                0.000197                       13.38685                     2101.501072                             0.956576                           932.960894       0\n568       14.580         13.66           94.29      658.8          0.09832           0.08918  ...                                0.000215                       16.84222                     2943.670503                             1.539574                          1938.020352       1\n\n[569 rows x 36 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.feature_engineering import FeatureGenerator\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; fg = FeatureGenerator(strategy=\"dfs\", n_features=5, verbose=2)\n&gt;&gt;&gt; X = fg.fit_transform(X, y)\n\nFitting FeatureGenerator...\nGenerating new features...\n --&gt; 5 new features were added.\n\n\n&gt;&gt;&gt; # Note the radius error * worst smoothness column\n&gt;&gt;&gt; print(X)\n\n       mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  ...  worst fractal dimension  mean area - perimeter error  mean texture * worst fractal dimension  symmetry error / concave points error  texture error * worst area  worst radius / compactness error\nindex                                                                                           ...                                                                                                                                                                                                   \n0            17.99         10.38          122.80     1001.0          0.11840           0.27760  ...                  
0.11890                      992.411                                1.234182                               1.892250                   1827.8007                        517.536705\n1            20.57         17.77          132.90     1326.0          0.08474           0.07864  ...                  0.08902                     1322.602                                1.581885                               1.036567                   1435.5084                       1910.550459\n2            19.69         21.25          130.00     1203.0          0.10960           0.15990  ...                  0.08758                     1198.415                                1.861075                               1.093294                   1344.8121                        588.367449\n3            11.42         20.38           77.58      386.1          0.14250           0.28390  ...                  0.17300                      382.655                                3.525740                               3.193894                    656.2612                        199.919549\n4            20.29         14.34          135.10     1297.0          0.10030           0.13280  ...                  0.07678                     1291.562                                1.101025                               0.931565                   1230.5475                        915.887850\n...            ...           ...             ...        ...              ...               ...  ...                      ...                          ...                                     ...                                    ...                         ...                               ...\n564          21.56         22.39          142.00     1479.0          0.11100           0.11590  ...                  
0.07115                     1471.327                                1.593049                               0.453953                   2545.9120                        880.318229\n565          20.13         28.25          131.20     1261.0          0.09780           0.10340  ...                  0.06637                     1255.797                                1.874953                               1.131108                   4263.4530                        977.713578\n566          16.60         28.08          108.30      858.1          0.08455           0.10230  ...                  0.07820                      854.675                                2.195856                               0.846500                   1208.3000                        508.710801\n567          20.60         29.33          140.10     1265.0          0.11780           0.27700  ...                  0.12400                     1259.228                                3.636920                               1.396635                   2904.4950                        417.992855\n568           7.76         24.54           47.92      181.0          0.05263           0.04362  ...                  0.07039                      178.452                                1.727371                                    inf                    383.5608                       2029.184549\n\n[569 rows x 35 columns]\n</code></pre>"}, {"location": "API/feature_engineering/featuregenerator/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformGenerate new features.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. 
Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Generate new features.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. </p> <p></p>"}, {"location": "API/feature_engineering/featuregrouper/", "title": "FeatureGrouper", "text": "<p>class atom.feature_engineering.FeatureGrouper(groups, operators=None, drop_columns=True, verbose=0, logger=None)[source]Extract statistics from similar features.</p> <p>Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the <code>groups</code> method.</p> <p>This class can be accessed from atom through the feature_grouping method. 
Read more in the user guide.</p> <p>Parametersgroups: dict Group names and features. A feature can belong to multiple groups. <p>operators: str, sequence or None, default=None Statistical operators to apply on the groups. Any operator from <code>numpy</code> or <code>scipy.stats</code> (checked in that order) that is applied on an array can be used. If None, it uses: <code>min</code>, <code>max</code>, <code>mean</code>, <code>median</code>, <code>mode</code> and <code>std</code>. <p>drop_columns: bool, default=True Whether to drop the columns in <code>groups</code> after transformation. <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesfeature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. 
<p></p> <p></p> <p>See Also</p> <p>FeatureExtractor Extract features from datetime columns.</p> <p>FeatureGenerator Generate new features.</p> <p>FeatureSelector Reduce the number of features in the data.</p> <p></p>"}, {"location": "API/feature_engineering/featuregrouper/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.feature_grouping({\"group1\": \"mean.*\"}, verbose=2)\n\nFitting FeatureGrouper...\nGrouping features...\n --&gt; Group group1 successfully created.\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n     radius error  texture error  perimeter error  area error  smoothness error  compactness error  concavity error  concave points error  symmetry error  ...  worst symmetry  worst fractal dimension  min(group1)  max(group1)  mean(group1)  median(group1)  mode(group1)  std(group1)  target\n0          0.5190         2.9100            5.801       67.10          0.007545           0.060500         0.021340              0.018430         0.03056  ...          0.2311                  0.09203      0.07224       1132.0    130.736684        0.186400       0.07224   335.890773       0\n1          0.4564         1.0750            3.425       48.55          0.005903           0.037310         0.047300              0.015570         0.01318  ...          0.2218                  0.07820      0.05302        858.1    101.162786        0.130650       0.05302   254.320568       0\n2          0.2298         0.9988            1.534       22.18          0.002826           0.009105         0.013110              0.005174         0.01013  ...          
0.2683                  0.06829      0.02847        758.6     89.400425        0.116550       0.02847   224.981976       0\n3          0.3117         0.8155            1.972       27.94          0.005217           0.015150         0.016780              0.012680         0.01669  ...          0.2723                  0.07071      0.05723        761.7     89.389875        0.138110       0.09462   226.081026       1\n4          0.3336         1.8600            2.041       19.91          0.011880           0.037470         0.045910              0.015440         0.02287  ...          0.2383                  0.09026      0.03068        334.2     43.414796        0.161250       0.03068    99.030712       1\n..            ...            ...              ...         ...               ...                ...              ...                   ...             ...  ...             ...                      ...          ...          ...           ...             ...           ...          ...     ...\n564        0.4727         1.2400            3.195       45.40          0.005718           0.011620         0.019980              0.011090         0.01410  ...          0.3029                  0.08216      0.05259        684.5     81.456503        0.128635       0.05259   202.924880       0\n565        0.8601         1.4800            7.029      111.70          0.008124           0.036110         0.054890              0.027650         0.03176  ...          0.2909                  0.05865      0.05024       1290.0    146.813205        0.170250       0.05024   383.094862       0\n566        0.2094         0.7636            1.231       17.67          0.008725           0.020030         0.023350              0.011320         0.02625  ...          
0.3380                  0.09584      0.03370        513.7     62.632288        0.136750       0.03370   152.314252       1\n567        0.2818         0.7614            1.808       18.54          0.006142           0.006134         0.001835              0.003576         0.01637  ...          0.2738                  0.07685      0.00309        366.8     45.967364        0.109675       0.00309   108.819747       1\n568        0.2810         0.8135            3.369       23.81          0.004929           0.066570         0.076830              0.013680         0.01526  ...          0.2845                  0.12490      0.02833        542.9     66.369889        0.141200       0.02833   160.878141       1\n\n[569 rows x 27 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.feature_engineering import FeatureGrouper\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; fg = FeatureGrouper({\"group1\": [\"mean texture\", \"mean radius\"]}, verbose=2)\n&gt;&gt;&gt; X = fg.transform(X)\n\nGrouping features...\n --&gt; Group group1 successfully created.\n\n\n&gt;&gt;&gt; print(X)\n\n     mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  mean fractal dimension  radius error  ...  worst concave points  worst symmetry  worst fractal dimension  min(group1)  max(group1)  mean(group1)  median(group1)  mode(group1)  std(group1)\n0            122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419                 0.07871        1.0950  ...                0.2654          0.4601                  0.11890        10.38        17.99        14.185          14.185         10.38        3.805\n1            132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812                 0.05667        0.5435  ...                
0.1860          0.2750                  0.08902        17.77        20.57        19.170          19.170         17.77        1.400\n2            130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069                 0.05999        0.7456  ...                0.2430          0.3613                  0.08758        19.69        21.25        20.470          20.470         19.69        0.780\n3             77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597                 0.09744        0.4956  ...                0.2575          0.6638                  0.17300        11.42        20.38        15.900          15.900         11.42        4.480\n4            135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809                 0.05883        0.7572  ...                0.1625          0.2364                  0.07678        14.34        20.29        17.315          17.315         14.34        2.975\n..              ...        ...              ...               ...             ...                  ...            ...                     ...           ...  ...                   ...             ...                      ...          ...          ...           ...             ...           ...          ...\n564          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726                 0.05623        1.1760  ...                0.2216          0.2060                  0.07115        21.56        22.39        21.975          21.975         21.56        0.415\n565          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752                 0.05533        0.7655  ...                
0.1628          0.2572                  0.06637        20.13        28.25        24.190          24.190         20.13        4.060\n566          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590                 0.05648        0.4564  ...                0.1418          0.2218                  0.07820        16.60        28.08        22.340          22.340         16.60        5.740\n567          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397                 0.07016        0.7260  ...                0.2650          0.4087                  0.12400        20.60        29.33        24.965          24.965         20.60        4.365\n568           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587                 0.05884        0.3857  ...                0.0000          0.2871                  0.07039         7.76        24.54        16.150          16.150          7.76        8.390\n\n[569 rows x 34 columns]\n</code></pre>"}, {"location": "API/feature_engineering/featuregrouper/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformGroup features.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. 
Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Group features.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. </p> <p></p>"}, {"location": "API/feature_engineering/featureselector/", "title": "FeatureSelector", "text": "<p>class atom.feature_engineering.FeatureSelector(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", verbose=0, logger=None, random_state=None, **kwargs)[source]Reduce the number of features in the data.</p> <p>Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. 
Additionally, remove multicollinear and low-variance features.</p> <p>This class can be accessed from atom through the feature_selection method. Read more in the user guide.</p> <p>Warning</p> <ul> <li>Ties between features with equal scores are broken in an   unspecified way.</li> <li>For strategy=\"rfecv\", the <code>n_features</code> parameter is the   minimum number of features to select, not the actual   number of features that the transformer returns. It may very   well be that it returns more!</li> </ul> <p>Info</p> <ul> <li>The \"sklearnex\" and \"cuml\" engines are only supported for   strategy=\"pca\" with dense datasets.</li> <li>If strategy=\"pca\" and the data is dense and unscaled, it's   scaled to mean=0 and std=1 before fitting the PCA transformer.</li> <li>If strategy=\"pca\" and the provided data is sparse, the used   estimator is TruncatedSVD, which works more efficiently   with sparse matrices.</li> </ul> <p>Tip</p> <ul> <li>Use the plot_pca and plot_components methods to   examine the results after using strategy=\"pca\".</li> <li>Use the plot_rfecv method to examine the results after   using strategy=\"rfecv\".</li> <li>Use the plot_feature_importance method to examine how   much a specific feature contributes to the final predictions.   If the model doesn't have a <code>feature_importances_</code> attribute,   use plot_permutation_importance instead.</li> </ul> <p>Parametersstrategy: str or None, default=None Feature selection strategy to use. 
Choose from: <ul> <li>None: Do not perform any feature selection strategy.</li> <li>\"univariate\": Univariate statistical F-test.</li> <li>\"pca\": Principal Component Analysis.</li> <li>\"sfm\": Select best features according to a model.</li> <li>\"sfs\": Sequential Feature Selection.</li> <li>\"rfe\": Recursive Feature Elimination.</li> <li>\"rfecv\": RFE with cross-validated selection.</li> <li>\"pso\": Particle Swarm Optimization.</li> <li>\"hho\": Harris Hawks Optimization.</li> <li>\"gwo\": Grey Wolf Optimization.</li> <li>\"dfo\": Dragonfly Optimization.</li> <li>\"go\": Genetic Optimization.</li> </ul> <p>solver: str, func, estimator or None, default=None Solver/estimator to use for the feature selection strategy. See the corresponding documentation for an extended description of the choices. If None, the default value is used (only if strategy=\"pca\"). Choose from: <ul> <li> <p>If strategy=\"univariate\":</p> <ul> <li>\"f_classif\"</li> <li>\"f_regression\"</li> <li>\"mutual_info_classif\"</li> <li>\"mutual_info_regression\"</li> <li>\"chi2\"</li> <li>Any function with signature <code>func(X, y) -&gt; tuple[scores, p-values]</code>.</li> </ul> </li> <li> <p>If strategy=\"pca\":</p> <ul> <li> <p>If data is dense:</p> <ul> <li> <p>If engine=\"sklearn\":</p> <ul> <li>\"auto\" (default)</li> <li>\"full\"</li> <li>\"arpack\"</li> <li>\"randomized\"</li> </ul> </li> <li> <p>If engine=\"sklearnex\":</p> <ul> <li>\"full\" (default)</li> </ul> </li> <li> <p>If engine=\"cuml\":</p> <ul> <li>\"full\" (default)</li> <li>\"jacobi\"</li> </ul> </li> </ul> </li> <li> <p>If data is sparse:</p> <ul> <li>\"randomized\" (default)</li> <li>\"arpack\"</li> </ul> </li> </ul> </li> <li> <p>for the remaining strategies:   The base estimator. For sfm, rfe and rfecv, it should have   either a <code>feature_importances_</code> or <code>coef_</code> attribute after   fitting. You can use one of the predefined models. 
Add   <code>_class</code> or <code>_reg</code> after the model's  name to specify a   classification or regression task, e.g., <code>solver=\"LGB_reg\"</code>   (not necessary if called from atom). No default option.</p> </li> </ul> <p>n_features: int, float or None, default=None Number of features to select. <ul> <li>If None: Select all features.</li> <li>If &lt;1: Fraction of the total features to select.</li> <li>If &gt;=1: Number of features to select.</li> </ul> <p>If strategy=\"sfm\" and the threshold parameter is not specified, the threshold is automatically set to <code>-inf</code> to select <code>n_features</code> number of features.</p> <p>If strategy=\"rfecv\", <code>n_features</code> is the minimum number of features to select.</p> <p>This parameter is ignored if any of the following strategies is selected: pso, hho, gwo, dfo, go.</p> <p>min_repeated: int, float or None, default=2 Remove categorical features if there isn't any repeated value in at least <code>min_repeated</code> rows. The default is to keep all features with non-maximum variance, i.e., remove the features whose number of unique values is equal to the number of rows (usually the case for names, IDs, etc...). <ul> <li>If None: No check for minimum repetition.</li> <li>If &gt;1: Minimum repetition number.</li> <li>If &lt;=1: Minimum repetition fraction.</li> </ul> <p>max_repeated: int, float or None, default=1.0 Remove categorical features with the same value in at least <code>max_repeated</code> rows. The default is to keep all features with non-zero variance, i.e., remove the features that have the same value in all samples. <ul> <li>If None: No check for maximum repetition.</li> <li>If &gt;1: Maximum number of repeated occurrences.</li> <li>If &lt;=1: Maximum fraction of repeated occurrences.</li> </ul> <p>max_correlation: float or None, default=1.0 Minimum absolute Pearson correlation to identify correlated features. 
For each group, it removes all except the feature with the highest correlation to <code>y</code> (if provided, else it removes all but the first). The default value removes equal columns. If None, skip this step. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p>**kwargs Any extra keyword argument for the strategy estimator. See the corresponding documentation for the available options. <p>Attributescollinear_: pd.DataFrame Information on the removed collinear features. Columns include: <ul> <li>drop: Name of the dropped feature.</li> <li>corr_feature: Names of the correlated features.</li> <li>corr_value: Corresponding correlation coefficients.</li> </ul> <p>[strategy]_: sklearn transformer Object used to transform the data, e.g., <code>fs.pca</code> for the pca strategy. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>FeatureExtractor Extract features from datetime columns.</p> <p>FeatureGenerator Generate new features.</p> <p>FeatureGrouper Extract statistics from similar features.</p> <p></p>"}, {"location": "API/feature_engineering/featureselector/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.feature_selection(strategy=\"pca\", n_features=12, verbose=2)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; Applying Principal Component Analysis...\n   --&gt; Scaling features...\n   --&gt; Keeping 12 components.\n   --&gt; Explained variance ratio: 0.971\n\n\n&gt;&gt;&gt; # Note that the column names changed\n&gt;&gt;&gt; print(atom.dataset)\n\n         pca0      pca1      pca2      pca3      pca4      pca5      pca6      pca7      pca8     
 pca9     pca10     pca11  target\n0    1.933532  2.215152  1.268851 -1.776239  0.069615 -0.043647  0.281363  0.122942 -0.911086 -0.223754 -0.086316 -0.929486       1\n1    1.203025  6.706587  4.445104  0.087116  3.044271 -1.130720  0.820790 -0.593311 -1.004105  0.945411 -0.199241  0.948766       1\n2    4.506063 -1.419715 -1.216228  1.189962  0.227850  0.788522 -0.829805  0.521853 -0.381054  0.676945  0.004564  0.066630       0\n3   -2.179059  0.496110 -0.870279 -0.151235 -0.715354  0.983901 -0.232186  0.449653  0.350218  0.644448  0.280308 -0.544707       1\n4    0.708048  0.859536 -2.683579  0.295765  0.712158 -1.105250 -0.226270 -0.264257  0.494656 -0.643629 -0.152528 -0.008835       0\n..        ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...     ...\n564 -2.477152 -1.482251 -0.389774 -0.333742  0.627651 -0.475717 -0.048757 -0.337669  0.382336  0.132000  0.204445  0.118625       1\n565 -0.400165  0.078366 -2.082886 -1.024593  0.623709 -1.003931  0.571384  0.248557 -0.489957 -0.397008 -0.132552 -0.162104       0\n566 -2.956303 -0.111232 -0.770455  0.035805  0.308638  0.311849  0.119611 -0.994997  0.495694 -0.130586  0.214798  0.358027       1\n567 -5.409548 -0.784989  1.540835  2.205277  0.249963  1.552586  1.837439 -0.796343  0.508352  0.011600 -0.066693 -0.006518       1\n568 -3.648393 -1.340745  0.503077  4.546174 -0.221396  1.229170  0.687803  0.711380  0.527799  0.139843 -0.958308  0.834252       1\n\n[569 rows x 13 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.feature_engineering import FeatureSelector\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; fs = FeatureSelector(strategy=\"pca\", n_features=12, verbose=2)\n&gt;&gt;&gt; X = fs.fit_transform(X)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; Applying Principal Component Analysis...\n   --&gt; 
Scaling features...\n   --&gt; Keeping 12 components.\n   --&gt; Explained variance ratio: 0.97\n\n\n&gt;&gt;&gt; # Note that the column names changed\n&gt;&gt;&gt; print(X)\n\n          pca0       pca1      pca2      pca3      pca4      pca5      pca6      pca7      pca8      pca9     pca10     pca11\n0     9.192837   1.948583 -1.123166  3.633731 -1.195110  1.411424  2.159370 -0.398407 -0.157118 -0.877402  0.262955 -0.859014\n1     2.387802  -3.768172 -0.529293  1.118264  0.621775  0.028656  0.013358  0.240988 -0.711905  1.106995  0.813120  0.157923\n2     5.733896  -1.075174 -0.551748  0.912083 -0.177086  0.541452 -0.668166  0.097374  0.024066  0.454275 -0.605604  0.124387\n3     7.122953  10.275589 -3.232790  0.152547 -2.960878  3.053422  1.429911  1.059565 -1.405440 -1.116975 -1.151514  1.011316\n4     3.935302  -1.948072  1.389767  2.940639  0.546747 -1.226495 -0.936213  0.636376 -0.263805  0.377704  0.651360 -0.110515\n..         ...        ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       
...\n564   6.439315  -3.576817  2.459487  1.177314 -0.074824 -2.375193 -0.596130 -0.035471  0.987929  0.256989 -0.062651  0.123342\n565   3.793382  -3.584048  2.088476 -2.506028 -0.510723 -0.246710 -0.716326 -1.113360 -0.105207 -0.108632  0.244804  0.222753\n566   1.256179  -1.902297  0.562731 -2.089227  1.809991 -0.534447 -0.192758  0.341887  0.393917  0.520877 -0.840512  0.096473\n567  10.374794   1.672010 -1.877029 -2.356031 -0.033742  0.567936  0.223082 -0.280239 -0.542035 -0.089296 -0.178628 -0.697461\n568  -5.475243  -0.670637  1.490443 -2.299157 -0.184703  1.617837  1.698952  1.046354  0.374101 -0.047726 -0.144094 -0.179496\n\n[569 rows x 12 columns]\n</code></pre>"}, {"location": "API/feature_engineering/featureselector/#methods", "title": "Methods", "text": "<p>fitFit the feature selector to the data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformTransform the data.</p> <p></p> <p>method fit(X, y=None)[source]Fit the feature selector to the data.</p> <p>The univariate, sfm (when model is not fitted), sfs, rfe and rfecv strategies need a target column. Leaving it None raises an exception.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsself Estimator instance. 
</p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_metadata_routing()[source]Get metadata routing of this object.</p> <p>Returnsrouting : MetadataRequest A :class:<code>~sklearn.utils.metadata_routing.MetadataRequest</code> encapsulating routing information. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Transform the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. 
</p> <p></p>"}, {"location": "API/models/adab/", "title": "AdaBoost", "text": "<p>AdaB accept sparse</p> <p>AdaBoost is a meta-estimator that begins by fitting a classifier/regressor on the original dataset and then fits additional copies of the algorithm on the same dataset but where the weights of instances are adjusted according to the error of the current prediction.</p> <p>Corresponding estimators are:</p> <ul> <li>AdaBoostClassifier for classification tasks.</li> <li>AdaBoostRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>RandomForest Random Forest.</p> <p>XGBoost Extreme Gradient Boosting.</p> <p></p>"}, {"location": "API/models/adab/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"AdaB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: AdaB\nMetric: f1\n\n\nResults for AdaBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9583\nTime elapsed: 0.221s\n-------------------------------------------------\nTime: 0.221s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.224s\n-------------------------------------\nAdaBoost --&gt; f1: 0.9583\n</code></pre>"}, {"location": "API/models/adab/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)algorithmCategoricalDistribution(choices=('SAMME.R', 'SAMME'))</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, 
step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)lossCategoricalDistribution(choices=('linear', 'square', 'exponential'))</p> <p></p> <p></p>"}, {"location": "API/models/adab/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/adab/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/adab/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/adab/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/ard/", "title": "AutomaticRelevanceDetermination", "text": "<p>ARD needs scaling</p> <p>Automatic Relevance Determination is very similar to BayesianRidge, but can lead to sparser coefficients. Fit the weights of a regression model, using an ARD prior. 
The weights of the regression model are assumed to be in Gaussian distributions.</p> <p>Corresponding estimators are:</p> <ul> <li>ARDRegression for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BayesianRidge Bayesian ridge regression.</p> <p>GaussianProcess Gaussian process.</p> <p>LeastAngleRegression Least Angle Regression.</p> <p></p>"}, {"location": "API/models/ard/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ARD\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ARD\nMetric: r2\n\n\nResults for AutomaticRelevanceDetermination:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6029\nTime elapsed: 0.139s\n-------------------------------------------------\nTime: 0.139s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.140s\n-------------------------------------\nAutomaticRelevanceDetermination --&gt; r2: 0.6029\n</code></pre>"}, {"location": "API/models/ard/#hyperparameters", "title": "Hyperparameters", "text": "<p>Parametersn_iterIntDistribution(high=1000, log=False, low=100, step=10)alpha_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)alpha_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/ard/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ard/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/ard/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/ard/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as a REST API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/arima/", "title": "ARIMA", "text": "<p>ARIMA native multioutput</p> <p>Seasonal ARIMA models and exogenous inputs are supported, hence this estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX.</p> <p>An ARIMA model is a generalization of an autoregressive moving average (ARMA) model, and is fitted to time-series data in an effort to forecast future points. ARIMA models can be especially efficacious in cases where data shows evidence of non-stationarity.</p> <p>The \"AR\" part of ARIMA indicates that the evolving variable of interest is regressed on its own lagged (i.e., prior observed) values. 
The \"MA\" part indicates that the regression error is actually a linear combination of error terms whose values occurred contemporaneously and at various times in the past. The \"I\" (for \"integrated\") indicates that the data values have been replaced with the difference between their values and the previous values (and this differencing process may have been performed more than once).</p> <p>Corresponding estimators are:</p> <ul> <li>ARIMA for forecasting tasks.</li> </ul> <p>Warning</p> <p>ARIMA often runs into numerical errors when optimizing the hyperparameters. Possible solutions are:</p> <ul> <li>Use the AutoARIMA model instead.</li> <li>Use <code>est_params</code> to specify the   orders manually, e.g., <code>atom.run(\"arima\", n_trials=5,est_params={\"order\": (1, 1, 0)})</code>.</li> <li>Use the <code>catch</code> parameter in <code>ht_params</code>   to avoid raising every exception, e.g., <code>atom.run(\"arima\",n_trials=5, ht_params={\"catch\": (Exception,)})</code>.</li> </ul> <p></p> <p>See Also</p> <p>AutoARIMA Automatic Autoregressive Integrated Moving Average Model.</p> <p></p>"}, {"location": "API/models/arima/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_longley\n\n&gt;&gt;&gt; _, X = load_longley()\n\n&gt;&gt;&gt; atom = ATOMForecaster(X)\n&gt;&gt;&gt; atom.run(models=\"ARIMA\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ARIMA\nMetric: mape\n\n\nResults for ARIMA:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0131\nTest evaluation --&gt; mape: -0.0364\nTime elapsed: 0.214s\n-------------------------------------------------\nTime: 0.214s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.215s\n-------------------------------------\nARIMA --&gt; mape: -0.0364\n</code></pre>"}, {"location": "API/models/arima/#hyperparameters", "title": "Hyperparameters", "text": 
"<p>ParameterspIntDistribution(high=2, log=False, low=0, step=1)dIntDistribution(high=1, log=False, low=0, step=1)qIntDistribution(high=2, log=False, low=0, step=1)PIntDistribution(high=2, log=False, low=0, step=1)DIntDistribution(high=1, log=False, low=0, step=1)QIntDistribution(high=2, log=False, low=0, step=1)SCategoricalDistribution(choices=(0, 4, 6, 7, 12))methodCategoricalDistribution(choices=('newton', 'nm', 'bfgs', 'lbfgs', 'powell', 'cg', 'ncg', 'basinhopping'))maxiterIntDistribution(high=200, log=False, low=50, step=10)with_interceptCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/arima/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/arima/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. 
n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/arima/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/arima/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. 
The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. 
If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of probabilities at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. 
</p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. 
The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/autoarima/", "title": "AutoARIMA", "text": "<p>AutoARIMA native multioutput</p> <p>ARIMA implementation that includes automated fitting of (S)ARIMA(X) hyperparameters (p, d, q, P, D, Q). The AutoARIMA algorithm seeks to identify the most optimal parameters for an ARIMA model, settling on a single fitted ARIMA model. This process is based on the commonly-used R function.</p> <p>AutoARIMA works by conducting differencing tests (i.e., Kwiatkowski\u2013Phillips\u2013Schmidt\u2013Shin, Augmented Dickey-Fuller or Phillips\u2013Perron) to determine the order of differencing, d, and then fitting models within defined ranges. AutoARIMA also seeks to identify the optimal P and Q hyperparameters after conducting the Canova-Hansen to determine the optimal order of seasonal differencing.</p> <p>Note that due to stationarity issues, AutoARIMA might not find a suitable model that will converge. 
If this is the case, a ValueError is thrown suggesting stationarity-inducing measures be taken prior to re-fitting or that a new range of order values be selected.</p> <p>Corresponding estimators are:</p> <ul> <li>AutoARIMA for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ARIMA Autoregressive Integrated Moving Average Model.</p> <p>ETS ETS model with automatic fitting capabilities.</p> <p></p>"}, {"location": "API/models/autoarima/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_longley\n\n&gt;&gt;&gt; _, X = load_longley()\n\n&gt;&gt;&gt; atom = ATOMForecaster(X, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"autoarima\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: AutoARIMA\nMetric: mape\n\n\nResults for AutoARIMA:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0131\nTest evaluation --&gt; mape: -0.0359\nTime elapsed: 0.437s\n-------------------------------------------------\nTime: 0.437s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.438s\n-------------------------------------\nAutoARIMA --&gt; mape: -0.0359\n</code></pre>"}, {"location": "API/models/autoarima/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParametersmethodCategoricalDistribution(choices=('newton', 'nm', 'bfgs', 'lbfgs', 'powell', 'cg', 'ncg', 'basinhopping'))maxiterIntDistribution(high=200, log=False, low=50, step=10)with_interceptCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/autoarima/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/autoarima/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/autoarima/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/autoarima/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or 
existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. 
</p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. 
<p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. 
Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
</p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. 
<p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. 
</p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/bag/", "title": "Bagging", "text": "<p>Bag accept sparse</p> <p>Bagging uses an ensemble meta-estimator that fits base predictors on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator by introducing randomization into its construction procedure and then making an ensemble out of it.</p> <p>Corresponding estimators are:</p> <ul> <li>BaggingClassifier for classification tasks.</li> <li>BaggingRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>LogisticRegression Logistic Regression.</p> <p>RandomForest Random Forest.</p> <p></p>"}, {"location": "API/models/bag/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Bag\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Bag\nMetric: f1\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9982\nTest evaluation --&gt; f1: 0.9444\nTime elapsed: 0.101s\n-------------------------------------------------\nTime: 0.101s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.104s\n-------------------------------------\nBagging --&gt; f1: 
0.9444\n</code></pre>"}, {"location": "API/models/bag/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)max_samplesFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)max_featuresFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)bootstrapCategoricalDistribution(choices=(True, False))bootstrap_featuresCategoricalDistribution(choices=(True, False))</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)max_samplesFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)max_featuresFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)bootstrapCategoricalDistribution(choices=(True, False))bootstrap_featuresCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/bag/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/bag/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. 
y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/bag/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. 
<p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/bag/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/bnb/", "title": "BernoulliNB", "text": "<p>BNB accept sparse supports acceleration</p> <p>BernoulliNB implements the Naive Bayes algorithm for multivariate Bernoulli models. Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MNB works with occurrence counts, BNB is designed for binary/boolean features.</p> <p>Corresponding estimators are:</p> <ul> <li>BernoulliNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>ComplementNB Complement Naive Bayes.</p> <p>CategoricalNB Categorical Naive Bayes.</p> <p>MultinomialNB Multinomial Naive Bayes.</p> <p></p>"}, {"location": "API/models/bnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"BNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: BNB\nMetric: f1\n\n\nResults for BernoulliNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.7709\nTest evaluation --&gt; f1: 0.7717\nTime elapsed: 0.023s\n-------------------------------------------------\nTime: 0.023s\n\n\nFinal results ==================== 
&gt;&gt;\nTotal time: 0.026s\n-------------------------------------\nBernoulliNB --&gt; f1: 0.7717\n</code></pre>"}, {"location": "API/models/bnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/bnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/bnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. 
target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/bnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. 
Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/bnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/br/", "title": "BayesianRidge", "text": "<p>BR needs scaling</p> <p>Bayesian regression techniques can be used to include regularization parameters in the estimation procedure: the regularization parameter is not set in a hard sense but tuned to the data at hand.</p> <p>Corresponding estimators are:</p> <ul> <li>BayesianRidge for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>AutomaticRelevanceDetermination Automatic Relevance Determination.</p> <p>GaussianProcess Gaussian process.</p> <p>LeastAngleRegression Least Angle Regression.</p> <p></p>"}, {"location": "API/models/br/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"BR\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: BR\nMetric: r2\n\n\nResults for BayesianRidge:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6028\nTime elapsed: 0.138s\n-------------------------------------------------\nTime: 0.138s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 
0.139s\n-------------------------------------\nBayesianRidge --&gt; r2: 0.6028\n</code></pre>"}, {"location": "API/models/br/#hyperparameters", "title": "Hyperparameters", "text": "<p>Parametersn_iterIntDistribution(high=1000, log=False, low=100, step=10)alpha_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)alpha_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/br/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/br/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/br/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/br/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/catb/", "title": "CatBoost", "text": "<p>CatB needs scaling accept sparse allows validation supports acceleration</p> <p>CatBoost is a machine learning method based on gradient boosting over decision trees. Main advantages of CatBoost:</p> <ul> <li>Superior quality when compared with other GBDT models on many   datasets.</li> <li>Best in class prediction speed.</li> </ul> <p>Corresponding estimators are:</p> <ul> <li>CatBoostClassifier for classification tasks.</li> <li>CatBoostRegressor for regression tasks.</li> </ul> <p>Read more in CatBoost's documentation.</p> <p>Warning</p> <ul> <li>CatBoost selects the weights achieved by the best evaluation   on the test set after training. This means that, by default,   there is some minor data leakage in the test set. Use the   <code>use_best_model=False</code> parameter to avoid this behavior or use   a holdout set to evaluate the final estimator.</li> <li>In-training validation and pruning are disabled when   <code>device=\"gpu\"</code>.</li> </ul> <p>Note</p> <p>ATOM uses CatBoost's <code>n_estimators</code> parameter instead of <code>iterations</code> to indicate the number of trees to fit. 
This is done to have consistent naming with the XGBoost and LightGBM models.</p> <p></p> <p>See Also</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>LightGBM Light Gradient Boosting Machine.</p> <p>XGBoost Extreme Gradient Boosting.</p> <p></p>"}, {"location": "API/models/catb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"CatB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: CatB\nMetric: f1\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9655\nTime elapsed: 14.218s\n-------------------------------------------------\nTime: 14.218s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 14.221s\n-------------------------------------\nCatBoost --&gt; f1: 0.9655\n</code></pre>"}, {"location": "API/models/catb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)bootstrap_typeCategoricalDistribution(choices=('Bayesian', 'Bernoulli'))bagging_temperatureFloatDistribution(high=10.0, log=False, low=0.0, step=None)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.001, step=None)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, 
step=None)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)bootstrap_typeCategoricalDistribution(choices=('Bayesian', 'Bernoulli'))bagging_temperatureFloatDistribution(high=10.0, log=False, low=0.0, step=None)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/catb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/catb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. 
target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/catb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. 
This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/catb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/catnb/", "title": "CategoricalNB", "text": "<p>CatNB accept sparse supports acceleration</p> <p>Categorical Naive Bayes implements the Naive Bayes algorithm for categorical features.</p> <p>Corresponding estimators are:</p> <ul> <li>CategoricalNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BernoulliNB Bernoulli Naive Bayes.</p> <p>ComplementNB Complement Naive Bayes.</p> <p>GaussianNB Gaussian Naive Bayes.</p> <p></p>"}, {"location": "API/models/catnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; import numpy as np\n\n&gt;&gt;&gt; X = np.random.randint(5, size=(100, 100))\n&gt;&gt;&gt; y = np.random.randint(2, size=100)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"CatNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: CatNB\nMetric: f1\n\n\nResults for CategoricalNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.4444\nTime elapsed: 0.029s\n-------------------------------------------------\nTime: 0.029s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.032s\n-------------------------------------\nCategoricalNB --&gt; f1: 0.4444 ~\n</code></pre>"}, {"location": 
"API/models/catnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/catnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/catnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). 
</p> <p></p>"}, {"location": "API/models/catnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. 
Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/catnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/cnb/", "title": "ComplementNB", "text": "<p>CNB accept sparse supports acceleration</p> <p>The Complement Naive Bayes classifier was designed to correct the \"severe assumptions\" made by the standard MultinomialNB classifier. It is particularly suited for imbalanced datasets.</p> <p>Corresponding estimators are:</p> <ul> <li>ComplementNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BernoulliNB Bernoulli Naive Bayes.</p> <p>CategoricalNB Categorical Naive Bayes.</p> <p>MultinomialNB Multinomial Naive Bayes.</p> <p></p>"}, {"location": "API/models/cnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"CNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: CNB\nMetric: f1\n\n\nResults for ComplementNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9221\nTest evaluation --&gt; f1: 0.9128\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 
0.023s\n-------------------------------------\nComplementNB --&gt; f1: 0.9128\n</code></pre>"}, {"location": "API/models/cnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))normCategoricalDistribution(choices=(True, False))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))normCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/cnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/cnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. 
features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/cnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/cnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/dummy/", "title": "Dummy", "text": "<p>Dummy</p> <p>When doing supervised learning, a simple sanity check consists of comparing one's estimator against simple rules of thumb. The prediction methods completely ignore the input data. Do not use this model for real problems. Use it only as a simple baseline to compare with other models.</p> <p>Corresponding estimators are:</p> <ul> <li>DummyClassifier for classification tasks.</li> <li>DummyRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>ExtraTree Extremely Randomized Tree.</p> <p>NaiveForecaster Naive Forecaster.</p> <p></p>"}, {"location": "API/models/dummy/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Dummy\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Dummy\nMetric: f1\n\n\nResults for Dummy:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.7709\nTest evaluation --&gt; f1: 0.7717\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal 
results ==================== &gt;&gt;\nTotal time: 0.021s\n-------------------------------------\nDummy --&gt; f1: 0.7717\n</code></pre>"}, {"location": "API/models/dummy/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParametersstrategyCategoricalDistribution(choices=('most_frequent', 'prior', 'stratified', 'uniform'))</p> <p>ParametersstrategyCategoricalDistribution(choices=('mean', 'median', 'quantile'))quantileFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/dummy/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/dummy/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/dummy/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/dummy/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/en/", "title": "ElasticNet", "text": "<p>EN needs scaling accept sparse supports acceleration</p> <p>Linear least squares with l1 and l2 regularization.</p> <p>Corresponding estimators are:</p> <ul> <li>ElasticNet for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>Lasso Linear Regression with lasso regularization.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p>Ridge Linear least squares with l2 regularization.</p> <p></p>"}, {"location": "API/models/en/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"EN\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: EN\nMetric: r2\n\n\nResults for ElasticNet:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.2061\nTest evaluation --&gt; r2: 0.2016\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.139s\n-------------------------------------\nElasticNet --&gt; r2: 0.2016\n</code></pre>"}, {"location": "API/models/en/#hyperparameters", "title": 
"Hyperparameters", "text": "sklearnsklearnexcuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> cpugpu <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p></p> <p></p>"}, {"location": "API/models/en/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/en/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. 
X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/en/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. 
best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/en/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/es/", "title": "ExponentialSmoothing", "text": "<p>ES native multioutput</p> <p>Holt-Winters exponential smoothing forecaster. The default settings use simple exponential smoothing, without trend and seasonality components.</p> <p>Corresponding estimators are:</p> <ul> <li>ExponentialSmoothing for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ARIMA Autoregressive Integrated Moving Average Model.</p> <p>ETS ETS model with automatic fitting capabilities.</p> <p>PolynomialTrend Polynomial Trend forecaster.</p> <p></p>"}, {"location": "API/models/es/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ES\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ES\nMetric: mape\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0864\nTest evaluation --&gt; mape: -0.2303\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.020s\n-------------------------------------\nExponentialSmoothing --&gt; mape: -0.2303\n</code></pre>"}, {"location": "API/models/es/#hyperparameters", 
"title": "Hyperparameters", "text": "<p>ParameterstrendCategoricalDistribution(choices=('add', 'mul', None))damped_trendCategoricalDistribution(choices=(True, False))seasonalCategoricalDistribution(choices=('add', 'mul', None))spCategoricalDistribution(choices=(4, 6, 7, 12, None))use_boxcoxCategoricalDistribution(choices=(True, False))initialization_methodCategoricalDistribution(choices=('estimated', 'heuristic'))methodCategoricalDistribution(choices=('L-BFGS-B', 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'bh', 'ls'))</p> <p></p> <p></p>"}, {"location": "API/models/es/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/es/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/es/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/es/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. 
The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. 
If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of probabilities at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. 
</p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. 
The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/et/", "title": "ExtraTrees", "text": "<p>ET accept sparse native multilabel native multioutput</p> <p>Extra-Trees use a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.</p> <p>Corresponding estimators are:</p> <ul> <li>ExtraTreesClassifier for classification tasks.</li> <li>ExtraTreesRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>ExtraTree Extremely Randomized Tree.</p> <p>RandomForest Random Forest.</p> <p></p>"}, {"location": "API/models/et/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ET\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ET\nMetric: f1\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9655\nTime elapsed: 0.110s\n-------------------------------------------------\nTime: 0.110s\n\n\nFinal results 
==================== &gt;&gt;\nTotal time: 0.112s\n-------------------------------------\nExtraTrees --&gt; f1: 0.9655\n</code></pre>"}, {"location": "API/models/et/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p></p> <p></p>"}, {"location": "API/models/et/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/et/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/et/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/et/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/etree/", "title": "ExtraTree", "text": "<p>ETree accept sparse native multilabel native multioutput</p> <p>Extra-trees differ from classic decision trees in the way they are built. When looking for the best split to separate the samples of a node into two groups, random splits are drawn for each of the max_features randomly selected features and the best split among those is chosen. 
When max_features is set 1, this amounts to building a totally random decision tree.</p> <p>Corresponding estimators are:</p> <ul> <li>ExtraTreeClassifier for classification tasks.</li> <li>ExtraTreeRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>ExtraTrees Extremely Randomized Trees.</p> <p>RandomForest Random Forest.</p> <p></p>"}, {"location": "API/models/etree/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ETree\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ETree\nMetric: f1\n\n\nResults for ExtraTree:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9241\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.024s\n-------------------------------------\nExtraTree --&gt; f1: 0.9241\n</code></pre>"}, {"location": "API/models/etree/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParameterscriterionCategoricalDistribution(choices=('gini', 'entropy'))splitterCategoricalDistribution(choices=('random', 'best'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> 
<p>ParameterscriterionCategoricalDistribution(choices=('squared_error', 'absolute_error'))splitterCategoricalDistribution(choices=('random', 'best'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p></p> <p></p>"}, {"location": "API/models/etree/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/etree/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. 
features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/etree/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/etree/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/ets/", "title": "ETS", "text": "<p>ETS native multioutput</p> <p>The ETS models are a family of time series models with an underlying state space model consisting of a level component, a trend component (T), a seasonal component (S), and an error term (E).</p> <p>Corresponding estimators are:</p> <ul> <li>AutoETS for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ARIMA Autoregressive Integrated Moving Average Model.</p> <p>ExponentialSmoothing Exponential Smoothing forecaster.</p> <p>PolynomialTrend Polynomial Trend forecaster.</p> <p></p>"}, {"location": "API/models/ets/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ETS\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ETS\nMetric: mape\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0858\nTest evaluation --&gt; mape: -0.2305\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.022s\n-------------------------------------\nETS --&gt; mape: -0.2305\n</code></pre>"}, {"location": "API/models/ets/#hyperparameters", "title": 
"Hyperparameters", "text": "<p>ParameterserrorCategoricalDistribution(choices=('add', 'mul'))trendCategoricalDistribution(choices=('add', 'mul', None))damped_trendCategoricalDistribution(choices=(True, False))seasonalCategoricalDistribution(choices=('add', 'mul', None))spCategoricalDistribution(choices=(1, 4, 6, 7, 12))initialization_methodCategoricalDistribution(choices=('estimated', 'heuristic'))maxiterIntDistribution(high=2000, log=False, low=500, step=100)autoCategoricalDistribution(choices=(True, False))information_criterionCategoricalDistribution(choices=('aic', 'bic', 'aicc'))</p> <p></p> <p></p>"}, {"location": "API/models/ets/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ets/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. 
n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/ets/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/ets/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. 
The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. 
If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. 
</p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. 
The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/gbm/", "title": "GradientBoostingMachine", "text": "<p>GBM accept sparse</p> <p>A Gradient Boosting Machine builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage <code>n_classes_</code> regression trees are fit on the negative gradient of the loss function, e.g. binary or multiclass log loss. Binary classification is a special case where only a single regression tree is induced.</p> <p>Corresponding estimators are:</p> <ul> <li>GradientBoostingClassifier for classification tasks.</li> <li>GradientBoostingRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p>Tip</p> <p>HistGradientBoosting is a much faster variant of this algorithm for intermediate datasets (n_samples &gt;= 10k).</p> <p></p> <p>See Also</p> <p>CatBoost Cat Boosting Machine.</p> <p>HistGradientBoosting Histogram-based Gradient Boosting Machine.</p> <p>LightGBM Light Gradient Boosting Machine.</p> <p></p>"}, {"location": "API/models/gbm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"GBM\", metric=\"f1\", 
verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: GBM\nMetric: f1\n\n\nResults for GradientBoostingMachine:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9589\nTime elapsed: 0.886s\n-------------------------------------------------\nTime: 0.886s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.890s\n-------------------------------------\nGradientBoostingMachine --&gt; f1: 0.9589\n</code></pre>"}, {"location": "API/models/gbm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParameterslossCategoricalDistribution(choices=('log_loss', 'exponential'))learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)n_estimatorsIntDistribution(high=500, log=False, low=10, step=10)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)criterionCategoricalDistribution(choices=('friedman_mse', 'squared_error'))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_depthIntDistribution(high=21, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>ParameterslossCategoricalDistribution(choices=('squared_error', 'absolute_error', 'huber', 'quantile'))learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)n_estimatorsIntDistribution(high=500, log=False, low=10, step=10)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)criterionCategoricalDistribution(choices=('friedman_mse', 'squared_error'))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_depthIntDistribution(high=21, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 
0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)alphaFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/gbm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gbm/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/gbm/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/gbm/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/gnb/", "title": "GaussianNB", "text": "<p>GNB supports acceleration</p> <p>Gaussian Naive Bayes implements the Naive Bayes algorithm for classification. 
The likelihood of the features is assumed to be Gaussian.</p> <p>Corresponding estimators are:</p> <ul> <li>GaussianNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BernoulliNB Bernoulli Naive Bayes.</p> <p>CategoricalNB Categorical Naive Bayes.</p> <p>ComplementNB Complement Naive Bayes.</p> <p></p>"}, {"location": "API/models/gnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"GNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: GNB\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9553\nTest evaluation --&gt; f1: 0.9371\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.022s\n-------------------------------------\nGaussianNB --&gt; f1: 0.9371\n</code></pre>"}, {"location": "API/models/gnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. 
y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/gnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). 
best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/gnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/gp/", "title": "GaussianProcess", "text": "<p>GP</p> <p>Gaussian Processes are a generic supervised learning method designed to solve regression and probabilistic classification problems. The advantages of Gaussian processes are:</p> <ul> <li>The prediction interpolates the observations.</li> <li>The prediction is probabilistic (Gaussian) so that one can compute   empirical confidence intervals and decide based on those if one   should refit (online fitting, adaptive fitting) the prediction in   some region of interest.</li> </ul> <p>The disadvantages of Gaussian processes include:</p> <ul> <li>They are not sparse, i.e., they use the whole samples/features   information to perform the prediction.</li> <li>They lose efficiency in high dimensional spaces, namely when the   number of features exceeds a few dozens.</li> </ul> <p>Corresponding estimators are:</p> <ul> <li>GaussianProcessClassifier for classification tasks.</li> <li>GaussianProcessRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>GaussianNB Gaussian Naive Bayes.</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>PassiveAggressive Passive Aggressive.</p> <p></p>"}, {"location": "API/models/gp/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import 
load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"GP\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: GP\nMetric: f1\n\n\nResults for GaussianProcess:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9437\nTime elapsed: 0.105s\n-------------------------------------------------\nTime: 0.105s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.109s\n-------------------------------------\nGaussianProcess --&gt; f1: 0.9437\n</code></pre>"}, {"location": "API/models/gp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gp/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). 
columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/gp/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. 
bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/gp/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/hgbm/", "title": "HistGradientBoosting", "text": "<p>hGBM</p> <p>This Histogram-based Gradient Boosting Machine is much faster than the standard GradientBoostingMachine for big datasets (n_samples&gt;=10k). This variation first bins the input samples into integer-valued bins which tremendously reduces the number of splitting points to consider, and allows the algorithm to leverage integer-based data structures (histograms) instead of relying on sorted continuous values when building the trees.</p> <p>Corresponding estimators are:</p> <ul> <li>HistGradientBoostingClassifier for classification tasks.</li> <li>HistGradientBoostingRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>CatBoost Cat Boosting Machine.</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>XGBoost Extreme Gradient Boosting.</p> <p></p>"}, {"location": "API/models/hgbm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"hGBM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: hGBM\nMetric: f1\n\n\nResults for HistGradientBoosting:\nFit 
---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9583\nTime elapsed: 0.357s\n-------------------------------------------------\nTime: 0.357s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.360s\n-------------------------------------\nHistGradientBoosting --&gt; f1: 0.9583\n</code></pre>"}, {"location": "API/models/hgbm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parameterslearning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_iterIntDistribution(high=500, log=False, low=10, step=10)max_leaf_nodesIntDistribution(high=50, log=False, low=10, step=1)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_leafIntDistribution(high=30, log=False, low=10, step=1)l2_regularizationFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p>ParameterslossCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson', 'quantile', 'gamma'))quantileFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_iterIntDistribution(high=500, log=False, low=10, step=10)max_leaf_nodesIntDistribution(high=50, log=False, low=10, step=1)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_leafIntDistribution(high=30, log=False, low=10, step=1)l2_regularizationFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/hgbm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/hgbm/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/hgbm/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/hgbm/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/huber/", "title": "HuberRegression", "text": "<p>Huber needs scaling</p> <p>Huber is a linear regression model that is robust to outliers. 
It makes sure that the loss function is not heavily influenced by the outliers while not completely ignoring their effect.</p> <p>Corresponding estimators are:</p> <ul> <li>HuberRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>AutomaticRelevanceDetermination Automatic Relevance Determination.</p> <p>LeastAngleRegression Least Angle Regression.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p></p>"}, {"location": "API/models/huber/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Huber\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Huber\nMetric: r2\n\n\nResults for HuberRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.546\nTest evaluation --&gt; r2: 0.5999\nTime elapsed: 0.187s\n-------------------------------------------------\nTime: 0.187s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.187s\n-------------------------------------\nHuberRegression --&gt; r2: 0.5999\n</code></pre>"}, {"location": "API/models/huber/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParametersepsilonFloatDistribution(high=10.0, log=True, low=1.0, step=None)max_iterIntDistribution(high=500, log=False, low=50, step=10)alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/huber/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/huber/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/huber/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/huber/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/knn/", "title": "KNearestNeighbors", "text": "<p>KNN needs scaling accept sparse native multilabel native multioutput supports acceleration</p> <p>K-Nearest Neighbors, as the name clearly indicates, implements the k-nearest neighbors vote. 
For regression, the target is predicted by local interpolation of the targets associated with the nearest neighbors in the training set.</p> <p>Corresponding estimators are:</p> <ul> <li>KNeighborsClassifier for classification tasks.</li> <li>KNeighborsRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.</p> <p>RadiusNearestNeighbors Radius Nearest Neighbors.</p> <p></p>"}, {"location": "API/models/knn/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"KNN\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: KNN\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.981\nTest evaluation --&gt; f1: 0.9793\nTime elapsed: 0.116s\n-------------------------------------------------\nTime: 0.116s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.119s\n-------------------------------------\nKNearestNeighbors --&gt; f1: 0.9793\n</code></pre>"}, {"location": "API/models/knn/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> cpugpu <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, 
step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> sklearnsklearnexcuml <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> cpugpu <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> 
<p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p></p> <p></p>"}, {"location": "API/models/knn/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/knn/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). 
</p> <p></p>"}, {"location": "API/models/knn/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. 
Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/knn/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lars/", "title": "LeastAngleRegression", "text": "<p>Lars needs scaling</p> <p>Least-Angle Regression is a regression algorithm for high-dimensional data. Lars is similar to forward stepwise regression. At each step, it finds the feature most correlated with the target. When there are multiple features having equal correlation, instead of continuing along the same feature, it proceeds in a direction equiangular between the features.</p> <p>Corresponding estimators are:</p> <ul> <li>Lars for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BayesianRidge Bayesian ridge regression.</p> <p>HuberRegression Huber regressor.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p></p>"}, {"location": "API/models/lars/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Lars\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Lars\nMetric: r2\n\n\nResults for LeastAngleRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6028\nTime elapsed: 
0.136s\n-------------------------------------------------\nTime: 0.136s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.137s\n-------------------------------------\nLeastAngleRegression --&gt; r2: 0.6028\n</code></pre>"}, {"location": "API/models/lars/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lars/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lars/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. 
The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lars/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. 
</p> <p></p>"}, {"location": "API/models/lasso/", "title": "Lasso", "text": "<p>Lasso needs scaling accept sparse supports acceleration</p> <p>Linear least squares with l1 regularization.</p> <p>Corresponding estimators are:</p> <ul> <li>Lasso for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>ElasticNet Linear Regression with elasticnet regularization.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p>Ridge Linear least squares with l2 regularization.</p> <p></p>"}, {"location": "API/models/lasso/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Lasso\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Lasso\nMetric: r2\n\n\nResults for Lasso:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.0\nTest evaluation --&gt; r2: -0.0001\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.139s\n-------------------------------------\nLasso --&gt; r2: -0.0001 ~\n</code></pre>"}, {"location": "API/models/lasso/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> cpugpu <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, 
step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p></p> <p></p>"}, {"location": "API/models/lasso/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lasso/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lasso/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lasso/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int umber of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lda/", "title": "LinearDiscriminantAnalysis", "text": "<p>LDA</p> <p>Linear Discriminant Analysis is a classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes\u2019 rule. 
The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.</p> <p>Corresponding estimators are:</p> <ul> <li>LinearDiscriminantAnalysis for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>LogisticRegression Logistic Regression.</p> <p>RadiusNearestNeighbors Radius Nearest Neighbors.</p> <p>QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.</p> <p></p>"}, {"location": "API/models/lda/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"LDA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9743\nTest evaluation --&gt; f1: 0.9726\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.025s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.028s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.9726\n</code></pre>"}, {"location": "API/models/lda/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParameterssolverCategoricalDistribution(choices=('svd', 'lsqr', 'eigen'))shrinkageCategoricalDistribution(choices=(None, 'auto', 0.5, 0.6, 0.7, 0.8, 0.9, 1.0))</p> <p></p> <p></p>"}, {"location": "API/models/lda/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lda/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lda/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lda/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lgb/", "title": "LightGBM", "text": "<p>LGB needs scaling accept sparse allows validation supports acceleration</p> <p>LightGBM is a gradient boosting model that uses tree based learning algorithms. 
It is designed to be distributed and efficient with the following advantages:</p> <ul> <li>Faster training speed and higher efficiency.</li> <li>Lower memory usage.</li> <li>Better accuracy.</li> <li>Capable of handling large-scale data.</li> </ul> <p>Corresponding estimators are:</p> <ul> <li>LGBMClassifier for classification tasks.</li> <li>LGBMRegressor for regression tasks.</li> </ul> <p>Read more in LightGBM's documentation.</p> <p>Info</p> <p>Using LightGBM's GPU acceleration requires additional software dependencies.</p> <p></p> <p>See Also</p> <p>CatBoost Cat Boosting Machine.</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>XGBoost Extreme Gradient Boosting.</p> <p></p>"}, {"location": "API/models/lgb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"LGB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: LGB\nMetric: f1\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9583\nTime elapsed: 0.426s\n-------------------------------------------------\nTime: 0.426s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.429s\n-------------------------------------\nLightGBM --&gt; f1: 0.9583\n</code></pre>"}, {"location": "API/models/lgb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=17, log=False, low=-1, step=2)num_leavesIntDistribution(high=40, log=False, low=20, step=1)min_child_weightFloatDistribution(high=100.0, log=True, 
low=0.0001, step=None)min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=17, log=False, low=-1, step=2)num_leavesIntDistribution(high=40, log=False, low=20, step=1)min_child_weightFloatDistribution(high=100.0, log=True, low=0.0001, step=None)min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/lgb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lgb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. 
y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lgb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. 
<p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. 
</p> <p></p>"}, {"location": "API/models/lgb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lr/", "title": "LogisticRegression", "text": "<p>LR needs scaling accept sparse supports acceleration</p> <p>Logistic regression, despite its name, is a linear model for classification rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.</p> <p>Corresponding estimators are:</p> <ul> <li>LogisticRegression for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>GaussianProcess Gaussian process.</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>PassiveAggressive Passive Aggressive.</p> <p></p>"}, {"location": "API/models/lr/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"RF\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation 
--&gt; f1: 0.9524\nTime elapsed: 0.229s\n-------------------------------------------------\nTime: 0.229s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.232s\n-------------------------------------\nRandomForest --&gt; f1: 0.9524\n</code></pre>"}, {"location": "API/models/lr/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml <p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> cpugpu <p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/lr/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lr/#data-attributes", 
"title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lr/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. 
<p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lr/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lsvm/", "title": "LinearSVM", "text": "<p>lSVM needs scaling accept sparse supports acceleration</p> <p>Similar to SupportVectorMachine but with a linear kernel. 
Implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.</p> <p>Corresponding estimators are:</p> <ul> <li>LinearSVC for classification tasks.</li> <li>LinearSVR for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>KNearestNeighbors K-Nearest Neighbors.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p>SupportVectorMachine Support Vector Machine.</p> <p></p>"}, {"location": "API/models/lsvm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"lSVM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: lSVM\nMetric: f1\n\n\nResults for LinearSVM:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.993\nTest evaluation --&gt; f1: 0.9722\nTime elapsed: 0.089s\n-------------------------------------------------\nTime: 0.089s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.092s\n-------------------------------------\nLinearSVM --&gt; f1: 0.9722\n</code></pre>"}, {"location": "API/models/lsvm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearncuml <p>ParameterspenaltyCategoricalDistribution(choices=('l1', 'l2'))lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))</p> <p>ParameterspenaltyCategoricalDistribution(choices=('l1', 'l2'))lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))CFloatDistribution(high=100.0, log=True, low=0.001, 
step=None)dualCategoricalDistribution(choices=(True, False))</p> sklearncuml <p>ParameterslossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))</p> <p>ParameterslossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/lsvm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lsvm/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lsvm/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lsvm/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/mlp/", "title": "MultiLayerPerceptron", "text": "<p>MLP needs scaling accept sparse native multilabel allows validation</p> <p>Multi-layer Perceptron is a supervised learning algorithm that learns a function by training on a dataset. Given a set of features and a target, it can learn a non-linear function approximator for either classification or regression. It is different from logistic regression, in that between the input and the output layer, there can be one or more non-linear layers, called hidden layers.</p> <p>Corresponding estimators are:</p> <ul> <li>MLPClassifier for classification tasks.</li> <li>MLPRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>PassiveAggressive Passive Aggressive.</p> <p>Perceptron Linear Perceptron classification.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p></p>"}, {"location": "API/models/mlp/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"MLP\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: MLP\nMetric: f1\n\n\nResults for 
MultiLayerPerceptron:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9965\nTest evaluation --&gt; f1: 0.979\nTime elapsed: 1.783s\n-------------------------------------------------\nTime: 1.783s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.786s\n-------------------------------------\nMultiLayerPerceptron --&gt; f1: 0.979\n</code></pre>"}, {"location": "API/models/mlp/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametershidden_layer_1IntDistribution(high=100, log=False, low=10, step=1)hidden_layer_2IntDistribution(high=100, log=False, low=0, step=1)hidden_layer_3IntDistribution(high=10, log=False, low=0, step=1)activationCategoricalDistribution(choices=('identity', 'logistic', 'tanh', 'relu'))solverCategoricalDistribution(choices=('lbfgs', 'sgd', 'adam'))alphaFloatDistribution(high=0.1, log=True, low=0.0001, step=None)batch_sizeCategoricalDistribution(choices=('auto', 8, 16, 32, 64, 128, 256))learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'adaptive'))learning_rate_initFloatDistribution(high=0.1, log=True, low=0.001, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=500, log=False, low=50, step=10)</p> <p>Parametershidden_layer_1IntDistribution(high=100, log=False, low=10, step=1)hidden_layer_2IntDistribution(high=100, log=False, low=0, step=1)hidden_layer_3IntDistribution(high=10, log=False, low=0, step=1)activationCategoricalDistribution(choices=('identity', 'logistic', 'tanh', 'relu'))solverCategoricalDistribution(choices=('lbfgs', 'sgd', 'adam'))alphaFloatDistribution(high=0.1, log=True, low=0.0001, step=None)batch_sizeCategoricalDistribution(choices=('auto', 8, 16, 32, 64, 128, 256))learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'adaptive'))learning_rate_initFloatDistribution(high=0.1, log=True, low=0.001, step=None)power_tFloatDistribution(high=0.9, log=False, 
low=0.1, step=0.1)max_iterIntDistribution(high=500, log=False, low=50, step=10)</p> <p></p> <p></p>"}, {"location": "API/models/mlp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/mlp/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/mlp/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. 
This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/mlp/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/mnb/", "title": "MultinomialNB", "text": "<p>MNB accept sparse supports acceleration</p> <p>MultinomialNB implements the Naive Bayes algorithm for multinomially distributed data, and is one of the two classic Naive Bayes variants used in text classification (where the data are typically represented as word vector counts, although tf-idf vectors are also known to work well in practice).</p> <p>Corresponding estimators are:</p> <ul> <li>MultinomialNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BernoulliNB Bernoulli Naive Bayes.</p> <p>ComplementNB Complement Naive Bayes.</p> <p>GaussianNB Gaussian Naive Bayes.</p> <p></p>"}, {"location": "API/models/mnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"MNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: MNB\nMetric: f1\n\n\nResults for MultinomialNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9238\nTest evaluation --&gt; f1: 0.9128\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 
0.021s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.024s\n-------------------------------------\nMultinomialNB --&gt; f1: 0.9128\n</code></pre>"}, {"location": "API/models/mnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/mnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/mnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/mnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/mnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/nf/", "title": "NaiveForecaster", "text": "<p>NF native multioutput</p> <p>NaiveForecaster is a dummy forecaster that makes forecasts using simple strategies based on naive assumptions about past trends continuing. When used in multivariate tasks, each column is forecasted with the same strategy.</p> <p>Corresponding estimators are:</p> <ul> <li>NaiveForecaster for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ExponentialSmoothing Exponential Smoothing forecaster.</p> <p>Dummy Dummy classifier/regressor.</p> <p>PolynomialTrend Polynomial Trend forecaster.</p> <p></p>"}, {"location": "API/models/nf/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"NF\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: NF\nMetric: mape\n\n\nResults for NaiveForecaster:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0858\nTest evaluation --&gt; mape: -0.2305\nTime elapsed: 0.022s\n-------------------------------------------------\nTime: 0.022s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.023s\n-------------------------------------\nNaiveForecaster --&gt; mape: -0.2305\n</code></pre>"}, 
{"location": "API/models/nf/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParametersstrategyCategoricalDistribution(choices=('last', 'mean', 'drift'))</p> <p></p> <p></p>"}, {"location": "API/models/nf/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/nf/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/nf/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. 
The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/nf/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing 
rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. 
</p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. 
<p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. 
Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
</p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. 
<p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. 
</p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/ols/", "title": "OrdinaryLeastSquares", "text": "<p>OLS needs scaling accept sparse supports acceleration</p> <p>Ordinary Least Squares is just linear regression without any regularization. It fits a linear model with coefficients <code>w=(w1,  ..., wp)</code> to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.</p> <p>Corresponding estimators are:</p> <ul> <li>LinearRegression for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>ElasticNet Linear Regression with elasticnet regularization.</p> <p>Lasso Linear Regression with lasso regularization.</p> <p>Ridge Linear least squares with l2 regularization.</p> <p></p>"}, {"location": "API/models/ols/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"OLS\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: OLS\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6028\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.138s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.6028\n</code></pre>"}, {"location": "API/models/ols/#attributes", 
"title": "Attributes", "text": ""}, {"location": "API/models/ols/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/ols/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models that with mlflow tracking enabled. 
study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/ols/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. 
</p> <p></p>"}, {"location": "API/models/omp/", "title": "OrthogonalMatchingPursuit", "text": "<p>OMP needs scaling</p> <p>Orthogonal Matching Pursuit implements the OMP algorithm for approximating the fit of a linear model with constraints imposed on the number of non-zero coefficients.</p> <p>Corresponding estimators are:</p> <ul> <li>OrthogonalMatchingPursuit for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>Lasso Linear Regression with lasso regularization.</p> <p>LeastAngleRegression Least Angle Regression.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p></p>"}, {"location": "API/models/omp/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"OMP\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: OMP\nMetric: r2\n\n\nResults for OrthogonalMatchingPursuit:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.4751\nTest evaluation --&gt; r2: 0.4668\nTime elapsed: 0.135s\n-------------------------------------------------\nTime: 0.135s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.136s\n-------------------------------------\nOrthogonalMatchingPursuit --&gt; r2: 0.4668\n</code></pre>"}, {"location": "API/models/omp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/omp/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/omp/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/omp/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/pa/", "title": "PassiveAggressive", "text": "<p>PA needs scaling accept sparse allows validation</p> <p>The passive-aggressive algorithms are a family of algorithms for large-scale learning. They are similar to the Perceptron in that they do not require a learning rate. 
However, contrary to the Perceptron, they include a regularization parameter <code>C</code>.</p> <p>Corresponding estimators are:</p> <ul> <li>PassiveAggressiveClassifier for classification tasks.</li> <li>PassiveAggressiveRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>MultiLayerPerceptron Multi-layer Perceptron.</p> <p>Perceptron Linear Perceptron classification.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p></p>"}, {"location": "API/models/pa/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"PA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: PA\nMetric: f1\n\n\nResults for PassiveAggressive:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9965\nTest evaluation --&gt; f1: 0.9504\nTime elapsed: 5.512s\n-------------------------------------------------\nTime: 5.512s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.515s\n-------------------------------------\nPassiveAggressive --&gt; f1: 0.9504\n</code></pre>"}, {"location": "API/models/pa/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)max_iterIntDistribution(high=1500, log=False, low=500, step=50)lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))averageCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)max_iterIntDistribution(high=1500, log=False, low=500, step=50)lossCategoricalDistribution(choices=('epsilon_insensitive', 
'squared_epsilon_insensitive'))averageCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/pa/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/pa/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/pa/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. 
This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/pa/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/perc/", "title": "Perceptron", "text": "<p>Perc needs scaling allows validation</p> <p>The Perceptron is a simple classification algorithm suitable for large scale learning. By default:</p> <ul> <li>It does not require a learning rate.</li> <li>It is not regularized (penalized).</li> <li>It updates its model only on mistakes.</li> </ul> <p>The last characteristic implies that the Perceptron is slightly faster to train than StochasticGradientDescent with the hinge loss and that the resulting models are sparser.</p> <p>Corresponding estimators are:</p> <ul> <li>Perceptron for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>MultiLayerPerceptron Multi-layer Perceptron.</p> <p>PassiveAggressive Passive Aggressive.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p></p>"}, {"location": "API/models/perc/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Perc\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Perc\nMetric: f1\n\n\nResults for Perceptron:\nFit ---------------------------------------------\nTrain 
evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9577\nTime elapsed: 5.509s\n-------------------------------------------------\nTime: 5.509s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.512s\n-------------------------------------\nPerceptron --&gt; f1: 0.9577\n</code></pre>"}, {"location": "API/models/perc/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l2', 'l1', 'elasticnet'))alphaFloatDistribution(high=10.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/perc/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/perc/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. 
shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/perc/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. 
estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/perc/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/pt/", "title": "PolynomialTrend", "text": "<p>PT native multioutput</p> <p>Forecast time series data with a polynomial trend, using a sklearn LinearRegression class to regress values of time series on index, after extraction of polynomial features.</p> <p>Corresponding estimators are:</p> <ul> <li>PolynomialTrendForecaster for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ARIMA Autoregressive Integrated Moving Average Model.</p> <p>ETS ETS model with automatic fitting capabilities.</p> <p>NaiveForecaster Naive Forecaster.</p> <p></p>"}, {"location": "API/models/pt/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"PT\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: PT\nMetric: mape\n\n\nResults for PolynomialTrend:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.1196\nTest evaluation --&gt; mape: -0.1181\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.019s\n-------------------------------------\nPolynomialTrend --&gt; mape: -0.1181\n</code></pre>"}, {"location": 
"API/models/pt/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParametersdegreeIntDistribution(high=5, log=False, low=1, step=1)with_interceptCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/pt/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/pt/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/pt/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. 
The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/pt/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing 
rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. 
</p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. 
<p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. 
Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
</p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. 
<p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. 
</p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/qda/", "title": "QuadraticDiscriminantAnalysis", "text": "<p>QDA</p> <p>Quadratic Discriminant Analysis is a classifier with a quadratic decision boundary, generated by fitting class conditional densities to the data and using Bayes\u2019 rule. The model fits a Gaussian density to each class, where every class has its own covariance matrix.</p> <p>Corresponding estimators are:</p> <ul> <li>QuadraticDiscriminantAnalysis for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>LogisticRegression Logistic Regression.</p> <p>RadiusNearestNeighbors Radius Nearest Neighbors.</p> <p></p>"}, {"location": "API/models/qda/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"QDA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: QDA\nMetric: f1\n\n\nResults for QuadraticDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9809\nTest evaluation --&gt; f1: 0.9504\nTime elapsed: 0.023s\n-------------------------------------------------\nTime: 0.023s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.026s\n-------------------------------------\nQuadraticDiscriminantAnalysis --&gt; f1: 0.9504\n</code></pre>"}, {"location": "API/models/qda/#hyperparameters", "title": 
"Hyperparameters", "text": "<p>Parametersreg_paramFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/qda/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/qda/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/qda/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/qda/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/rf/", "title": "RandomForest", "text": "<p>RF accept sparse native multilabel native multioutput supports acceleration</p> <p>Random forests are an ensemble learning method that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. 
Random forests correct for decision trees' habit of overfitting to their training set.</p> <p>Corresponding estimators are:</p> <ul> <li>RandomForestClassifier for classification tasks.</li> <li>RandomForestRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p>Warning</p> <p>cuML's implementation of RandomForestClassifier only supports predictions on dtype <code>float32</code>. Convert all dtypes before calling atom's run method to avoid exceptions.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>ExtraTrees Extremely Randomized Trees.</p> <p>HistGradientBoosting Histogram-based Gradient Boosting Machine.</p> <p></p>"}, {"location": "API/models/rf/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"RF\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9524\nTime elapsed: 0.232s\n-------------------------------------------------\nTime: 0.232s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.236s\n-------------------------------------\nRandomForest --&gt; f1: 0.9524\n</code></pre>"}, {"location": "API/models/rf/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, 
step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> cpugpu <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> sklearnsklearnexcuml <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> cpugpu <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 
'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p></p> <p></p>"}, {"location": "API/models/rf/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/rf/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. 
train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/rf/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. 
Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. 
The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/rf/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take 
bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. 
</p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. 
</p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. 
If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. 
</p> <p></p>"}, {"location": "API/models/ridge/", "title": "Ridge", "text": "<p>Ridge needs scaling accept sparse native multilabel supports acceleration</p> <p>If classifier, it first converts the target values into {-1, 1} and then treats the problem as a regression task.</p> <p>Corresponding estimators are:</p> <ul> <li>RidgeClassifier for classification tasks.</li> <li>Ridge for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p>Warning</p> <p>Engines <code>sklearnex</code> and <code>cuml</code> are only available for regression tasks.</p> <p></p> <p>See Also</p> <p>BayesianRidge Bayesian ridge regression.</p> <p>ElasticNet Linear Regression with elasticnet regularization.</p> <p>Lasso Linear Regression with lasso regularization.</p> <p></p>"}, {"location": "API/models/ridge/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Ridge\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Ridge\nMetric: r2\n\n\nResults for Ridge:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6028\nTime elapsed: 0.136s\n-------------------------------------------------\nTime: 0.136s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.137s\n-------------------------------------\nRidge --&gt; r2: 0.6028\n</code></pre>"}, {"location": "API/models/ridge/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> cpugpu 
<p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> sklearnsklearnexcuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> cpugpu <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p></p> <p></p>"}, {"location": "API/models/ridge/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ridge/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. 
Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/ridge/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. 
Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. 
The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/ridge/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take 
bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. 
</p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. 
</p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. 
If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/rnn/", "title": "RadiusNearestNeighbors", "text": "<p>RNN needs scaling accept sparse native multilabel native multioutput</p> <p>Radius Nearest Neighbors implements the nearest neighbors vote, where the neighbors are selected from within a given radius. 
For regression, the target is predicted by local interpolation of the targets associated of the nearest neighbors in the training set.</p> <p>Warning</p> <ul> <li>The <code>radius</code> parameter should be tuned to the data at hand or   the model will perform poorly.</li> <li>If outliers are detected, the estimator raises an exception   unless <code>est_params={\"outlier_label\": \"most_frequent\"}</code> is used.</li> </ul> <p>Corresponding estimators are:</p> <ul> <li>RadiusNeighborsClassifier for classification tasks.</li> <li>RadiusNeighborsRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>KNearestNeighbors K-Nearest Neighbors.</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.</p> <p></p>"}, {"location": "API/models/rnn/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\n...     models=\"RNN\",\n...     metric=\"f1\",\n...     est_params={\"outlier_label\": \"most_frequent\"},\n...     verbose=2,\n... 
)\n\n\nTraining ========================= &gt;&gt;\nModels: RNN\nMetric: f1\n\n\nResults for RadiusNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.7717\nTime elapsed: 0.091s\n-------------------------------------------------\nTime: 0.091s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.094s\n-------------------------------------\nRadiusNearestNeighbors --&gt; f1: 0.7717 ~\n</code></pre>"}, {"location": "API/models/rnn/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParametersradiusFloatDistribution(high=100.0, log=False, low=0.01, step=None)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p>ParametersradiusFloatDistribution(high=100.0, log=False, low=0.01, step=None)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p></p> <p></p>"}, {"location": "API/models/rnn/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/rnn/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. 
train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/rnn/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. 
Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. 
The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/rnn/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take 
bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. 
</p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. 
</p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. 
If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/sgd/", "title": "StochasticGradientDescent", "text": "<p>SGD needs scaling accept sparse allows validation</p> <p>Stochastic Gradient Descent is a simple yet very efficient approach to fitting linear classifiers and regressors under convex loss functions. 
Even though SGD has been around in the machine learning community for a long time, it has received a considerable amount of attention just recently in the context of large-scale learning.</p> <p>Corresponding estimators are:</p> <ul> <li>SGDClassifier for classification tasks.</li> <li>SGDRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>MultiLayerPerceptron Multi-layer Perceptron.</p> <p>PassiveAggressive Passive Aggressive.</p> <p>SupportVectorMachine Support Vector Machine.</p> <p></p>"}, {"location": "API/models/sgd/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"SGD\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: SGD\nMetric: f1\n\n\nResults for StochasticGradientDescent:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9948\nTest evaluation --&gt; f1: 0.9722\nTime elapsed: 5.506s\n-------------------------------------------------\nTime: 5.506s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.509s\n-------------------------------------\nStochasticGradientDescent --&gt; f1: 0.9722\n</code></pre>"}, {"location": "API/models/sgd/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParameterslossCategoricalDistribution(choices=('hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'))penaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, 
log=False, low=500, step=50)epsilonFloatDistribution(high=1.0, log=True, low=0.0001, step=None)learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'optimal', 'adaptive'))eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)averageCategoricalDistribution(choices=(True, False))</p> <p>ParameterslossCategoricalDistribution(choices=('squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'))penaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)epsilonFloatDistribution(high=1.0, log=True, low=0.0001, step=None)learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'optimal', 'adaptive'))eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)averageCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/sgd/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/sgd/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. 
y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/sgd/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. 
<p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. 
</p> <p></p>"}, {"location": "API/models/sgd/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int umber of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/svm/", "title": "SupportVectorMachine", "text": "<p>SVM needs scaling accept sparse supports acceleration</p> <p>The implementation of the Support Vector Machine is based on libsvm. The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples. For large datasets consider using a LinearSVM or a StochasticGradientDescent model instead.</p> <p>Corresponding estimators are:</p> <ul> <li>SVC for classification tasks.</li> <li>SVR for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>LinearSVM Linear Support Vector Machine.</p> <p>MultiLayerPerceptron Multi-layer Perceptron.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p></p>"}, {"location": "API/models/svm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"SVM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: SVM\nMetric: f1\n\n\nResults for SupportVectorMachine:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9913\nTest evaluation --&gt; f1: 0.979\nTime 
elapsed: 0.095s\n-------------------------------------------------\nTime: 0.095s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.098s\n-------------------------------------\nSupportVectorMachine --&gt; f1: 0.979\n</code></pre>"}, {"location": "API/models/svm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> cpugpu <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> sklearnsklearnexcuml <p>ParametersCFloatDistribution(high=100.0, log=True, 
low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> cpugpu <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/svm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/svm/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/svm/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/svm/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. 
</p> <p></p>"}, {"location": "API/models/tree/", "title": "DecisionTree", "text": "<p>Tree accept sparse native multilabel native multioutput</p> <p>A single decision tree classifier/regressor.</p> <p>Corresponding estimators are:</p> <ul> <li>DecisionTreeClassifier for classification tasks.</li> <li>DecisionTreeRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>ExtraTree Extremely Randomized Tree.</p> <p>ExtraTrees Extremely Randomized Trees.</p> <p>RandomForest Random Forest.</p> <p></p>"}, {"location": "API/models/tree/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Tree\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Tree\nMetric: f1\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9589\nTime elapsed: 0.032s\n-------------------------------------------------\nTime: 0.032s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.035s\n-------------------------------------\nDecisionTree --&gt; f1: 0.9589\n</code></pre>"}, {"location": "API/models/tree/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParameterscriterionCategoricalDistribution(choices=('gini', 'entropy'))splitterCategoricalDistribution(choices=('best', 'random'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 
0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>ParameterscriterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'friedman_mse', 'poisson'))splitterCategoricalDistribution(choices=('best', 'random'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p></p> <p></p>"}, {"location": "API/models/tree/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/tree/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). 
columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/tree/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. 
bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/tree/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/xgb/", "title": "XGBoost", "text": "<p>XGB needs scaling accept sparse allows validation supports acceleration</p> <p>XGBoost is an optimized distributed gradient boosting model designed to be highly efficient, flexible and portable. XGBoost provides a parallel tree boosting that solve many data science problems in a fast and accurate way.</p> <p>Corresponding estimators are:</p> <ul> <li>XGBClassifier for classification tasks.</li> <li>XGBRegressor for regression tasks.</li> </ul> <p>Read more in XGBoost's documentation.</p> <p></p> <p>See Also</p> <p>CatBoost Cat Boosting Machine.</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>LightGBM Light Gradient Boosting Machine.</p> <p></p>"}, {"location": "API/models/xgb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"XGB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: XGB\nMetric: f1\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9583\nTime elapsed: 0.401s\n-------------------------------------------------\nTime: 
0.401s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.404s\n-------------------------------------\nXGBoost --&gt; f1: 0.9583\n</code></pre>"}, {"location": "API/models/xgb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=20, log=False, low=1, step=1)gammaFloatDistribution(high=1.0, log=False, low=0.0, step=None)min_child_weightIntDistribution(high=10, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=20, log=False, low=1, step=1)gammaFloatDistribution(high=1.0, log=False, low=0.0, step=None)min_child_weightIntDistribution(high=10, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/xgb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/xgb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/xgb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/xgb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/nlp/textcleaner/", "title": "TextCleaner", "text": "<p>class atom.nlp.TextCleaner(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, verbose=0, logger=None)[source]Applies standard text cleaning to the corpus.</p> <p>Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. 
If there is no column with that name, an exception is raised.</p> <p>This class can be accessed from atom through the textclean method. Read more in the user guide.</p> <p>Parametersdecode: bool, default=True Whether to decode unicode characters to their ascii representations. <p>lower_case: bool, default=True Whether to convert all characters to lower case. <p>drop_email: bool, default=True Whether to drop email addresses from the text. <p>regex_email: str, default=None Regex used to search for email addresses. If None, it uses <code>r\"[\\w.-]+@[\\w-]+\\.[\\w.-]+\"</code>. <p>drop_url: bool, default=True Whether to drop URL links from the text. <p>regex_url: str, default=None Regex used to search for URLs. If None, it uses <code>r\"https?://\\S+|www\\.\\S+\"</code>. <p>drop_html: bool, default=True Whether to drop HTML tags from the text. This option is particularly useful if the data was scraped from a website. <p>regex_html: str, default=None Regex used to search for html tags. If None, it uses <code>r\"&lt;.*?&gt;\"</code>. <p>drop_emoji: bool, default=True Whether to drop emojis from the text. <p>regex_emoji: str, default=None Regex used to search for emojis. If None, it uses <code>r\":[a-z_]+:\"</code>. <p>drop_number: bool, default=True Whether to drop numbers from the text. <p>regex_number: str, default=None Regex used to search for numbers. If None, it uses <code>r\"\\b\\d+\\b\"</code>. Note that numbers adjacent to letters are not removed. <p>drop_punctuation: bool, default=True Whether to drop punctuations from the text. Characters considered punctuation are <code>!\"#$%&amp;'()*+,-./:;&lt;=&gt;?@[\\]^_`{|}~</code>. <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p></p> <p></p> <p>See Also</p> <p>TextNormalizer Normalize the corpus.</p> <p>Tokenizer Tokenize the corpus.</p> <p>Vectorizer Vectorize text data.</p> <p></p>"}, {"location": "API/nlp/textcleaner/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import fetch_20newsgroups\n\n&gt;&gt;&gt; X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n&gt;&gt;&gt; X = np.array(X).reshape(-1, 1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n                                                 corpus  target\n0     From: fabian@vivian.w.open.de (Fabian Hoppe)\\n...       1\n1     From: nyeda@cnsvax.uwec.edu (David Nye)\\nSubje...       0\n2     From: urathi@net4.ICS.UCI.EDU (Unmesh Rathi)\\n...       1\n3     From: inoue@crd.yokogawa.co.jp (Inoue Takeshi)...       1\n4     From: sandvik@newton.apple.com (Kent Sandvik)\\...       0\n...                                                 ...     ...\n1662  From: kutluk@ccl.umist.ac.uk (Kutluk Ozguven)\\...       0\n1663  From: dmp1@ukc.ac.uk (D.M.Procida)\\nSubject: R...       2\n1664  From: tdunbar@vtaix.cc.vt.edu (Thomas Dunbar)\\...       1\n1665  From: dmp@fig.citib.com (Donna M. Paino)\\nSubj...       2\n1666  From: cdm@pmafire.inel.gov (Dale Cook)\\nSubjec...       
2\n\n[1667 rows x 2 columns]\n\n\n&gt;&gt;&gt; atom.textclean(verbose=2)\n\nFitting TextCleaner...\nCleaning the corpus...\n --&gt; Decoding unicode characters to ascii.\n --&gt; Converting text to lower case.\n --&gt; Dropping emails from documents.\n --&gt; Dropping URL links from documents.\n --&gt; Dropping HTML tags from documents.\n --&gt; Dropping emojis from documents.\n --&gt; Dropping numbers from documents.\n --&gt; Dropping punctuation from the text.\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n                                                 corpus  target\n0     from  fabian hoppe\\nsubject searching cadsoftw...       1\n1     from  david nye\\nsubject re after  years can w...       0\n2     from  unmesh rathi\\nsubject motif and intervie...       1\n3     from  inoue takeshi\\nsubject how to see charac...       1\n4     from  kent sandvik\\nsubject re slavery was re ...       0\n...                                                 ...     ...\n1662  from  kutluk ozguven\\nsubject re jewish settle...       0\n1663  from  dmprocida\\nsubject re homeopathy a respe...       2\n1664  from  thomas dunbar\\nsubject re x toolkits\\nsu...       1\n1665  from  donna m paino\\nsubject psoriatic arthrit...       2\n1666  from  dale cook\\nsubject re morbus meniere  is...       2\n\n[1667 rows x 2 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom.nlp import TextCleaner\n&gt;&gt;&gt; from sklearn.datasets import fetch_20newsgroups\n\n&gt;&gt;&gt; X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... 
)\n&gt;&gt;&gt; X = np.array(X).reshape(-1, 1)\n\n&gt;&gt;&gt; textcleaner = TextCleaner(verbose=2)\n&gt;&gt;&gt; X = textcleaner.transform(X)\n\nCleaning the corpus...\n --&gt; Decoding unicode characters to ascii.\n --&gt; Converting text to lower case.\n --&gt; Dropping emails from documents.\n --&gt; Dropping URL links from documents.\n --&gt; Dropping HTML tags from documents.\n --&gt; Dropping emojis from documents.\n --&gt; Dropping numbers from documents.\n --&gt; Dropping punctuation from the text.\n\n\n&gt;&gt;&gt; print(X)\n\n                                                 corpus\n0     from  mark a deloura\\nsubject looking for x wi...\n1     from  der mouse\\nsubject re creating  bit wind...\n2     from  keith m ryan\\nsubject re where are they ...\n3     from  steven grimm\\nsubject re opinions on all...\n4     from  peter kaminski\\nsubject re krillean phot...\n...                                                 ...\n1662  from donald mackie \\nsubject re seeking advice...\n1663  from  gordon banks\\nsubject re update help was...\n1664  from  keith m ryan\\nsubject re political athei...\n1665  from  benedikt rosenau\\nsubject re biblical ra...\n1666  from derrick j brashear \\nsubject mouseless op...\n\n[1667 rows x 1 columns]\n</code></pre>"}, {"location": "API/nlp/textcleaner/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformApply the transformations to the data.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. 
Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Apply the transformations to the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed corpus. </p> <p></p>"}, {"location": "API/nlp/textnormalizer/", "title": "TextNormalizer", "text": "<p>class atom.nlp.TextNormalizer(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, verbose=0, logger=None)[source]Normalize the corpus.</p> <p>Convert words to a more uniform standard. The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised. 
If the provided documents are strings, words are separated by spaces.</p> <p>This class can be accessed from atom through the textnormalize method. Read more in the user guide.</p> <p>Parametersstopwords: bool or str, default=True Whether to remove a predefined dictionary of stopwords. <ul> <li>If False: Don't remove any predefined stopwords.</li> <li>If True: Drop predefined english stopwords from the text.</li> <li>If str: Language from <code>nltk.corpus.stopwords.words</code>.</li> </ul> <p>custom_stopwords: sequence or None, default=None Custom stopwords to remove from the text. <p>stem: bool or str, default=False Whether to apply stemming using SnowballStemmer. <ul> <li>If False: Don't apply stemming.</li> <li>If True: Apply stemmer based on the english language.</li> <li>If str: Language from <code>SnowballStemmer.languages</code>.</li> </ul> <p>lemmatize: bool, default=True Whether to apply lemmatization using WordNetLemmatizer. <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesfeature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>TextCleaner Applies standard text cleaning to the corpus.</p> <p>Tokenizer Tokenize the corpus.</p> <p>Vectorizer Vectorize text data.</p> <p></p>"}, {"location": "API/nlp/textnormalizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    
[\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n&gt;&gt;&gt; y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       1\n\n\n&gt;&gt;&gt; atom.textnormalize(stopwords=\"english\", lemmatize=True, verbose=2)\n\nFitting TextNormalizer...\nNormalizing the corpus...\n --&gt; Dropping stopwords.\n --&gt; Applying lemmatization.\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n                           corpus  target\n0                     [new, york]       0\n1              [another, line...]       1\n2               [New, york, nice]       0\n3  [new, york, large, washington]       1\n4                     [run, test]       0\n5             [I, \u00e0m, ne'w, york]       1\n6                          [test]       0\n7                     [hi, test!]       1\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.nlp import TextNormalizer\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n&gt;&gt;&gt; textnormalizer = TextNormalizer(\n...     stopwords=\"english\",\n...     lemmatize=True,\n...     verbose=2,\n... 
)\n&gt;&gt;&gt; X = textnormalizer.transform(X)\n\nNormalizing the corpus...\n --&gt; Dropping stopwords.\n --&gt; Applying lemmatization.\n\n\n&gt;&gt;&gt; print(X)\n\n                           corpus\n0             [I, \u00e0m, ne'w, york]\n1               [New, york, nice]\n2                     [new, york]\n3                     [hi, test!]\n4              [another, line...]\n5  [new, york, large, washington]\n6                     [run, test]\n7                          [test]\n</code></pre>"}, {"location": "API/nlp/textnormalizer/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformNormalize the text.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. 
<p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Normalize the text.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed corpus. </p> <p></p>"}, {"location": "API/nlp/tokenizer/", "title": "Tokenizer", "text": "<p>class atom.nlp.Tokenizer(bigram_freq=None, trigram_freq=None, quadgram_freq=None, verbose=0, logger=None)[source]Tokenize the corpus.</p> <p>Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>This class can be accessed from atom through the tokenize method. Read more in the user guide.</p> <p>Parametersbigram_freq: int, float or None, default=None Frequency threshold for bigram creation. <ul> <li>If None: Don't create any bigrams.</li> <li>If int: Minimum number of occurrences to make a bigram.</li> <li>If float: Minimum frequency fraction to make a bigram.</li> </ul> <p>trigram_freq: int, float or None, default=None Frequency threshold for trigram creation. <ul> <li>If None: Don't create any trigrams.</li> <li>If int: Minimum number of occurrences to make a trigram.</li> <li>If float: Minimum frequency fraction to make a trigram.</li> </ul> <p>quadgram_freq: int, float or None, default=None Frequency threshold for quadgram creation. 
<ul> <li>If None: Don't create any quadgrams.</li> <li>If int: Minimum number of occurrences to make a quadgram.</li> <li>If float: Minimum frequency fraction to make a quadgram.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesbigrams_: pd.DataFrame Created bigrams and their frequencies. <p>trigrams_: pd.DataFrame Created trigrams and their frequencies. <p>quadgrams_: pd.DataFrame Created quadgrams and their frequencies. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>TextCleaner Applies standard text cleaning to the corpus.</p> <p>TextNormalizer Normalize the corpus.</p> <p>Vectorizer Vectorize text data.</p> <p></p>"}, {"location": "API/nlp/tokenizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n&gt;&gt;&gt; y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       
1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       1\n\n\n&gt;&gt;&gt; atom.tokenize(verbose=2)\n\nFitting Tokenizer...\nTokenizing the corpus...\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n                                      corpus  target\n0                                [new, york]       0\n1                       [another, line, ...]       1\n2                      [New, york, is, nice]       0\n3  [new, york, is, larger, than, washington]       1\n4                       [running, the, test]       0\n5                [I, \u00e0m, in, ne, ', w, york]       1\n6                        [this, is, a, test]       0\n7          [hi, there, this, is, a, test, !]       1\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.nlp import Tokenizer\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... 
]\n\n&gt;&gt;&gt; tokenizer = Tokenizer(bigram_freq=2, verbose=2)\n&gt;&gt;&gt; X = tokenizer.transform(X)\n\nTokenizing the corpus...\n --&gt; Creating 5 bigrams on 10 locations.\n\n\n&gt;&gt;&gt; print(X)\n\n                                     corpus\n0               [I, \u00e0m, in, ne, ', w, york]\n1                      [New, york_is, nice]\n2                                [new_york]\n3           [hi, there, this_is, a_test, !]\n4                      [another, line, ...]\n5  [new, york_is, larger, than, washington]\n6                      [running, the, test]\n7                         [this_is, a_test]\n</code></pre>"}, {"location": "API/nlp/tokenizer/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformTokenize the text.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. 
</p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Tokenize the text.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed corpus. </p> <p></p>"}, {"location": "API/nlp/vectorizer/", "title": "Vectorizer", "text": "<p>class atom.nlp.Vectorizer(strategy=\"bow\", return_sparse=True, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Vectorize text data.</p> <p>Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix <code>corpus_</code>. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.</p> <p>This class can be accessed from atom through the vectorize method. 
Read more in the user guide.</p> <p>Parametersstrategy: str, default=\"bow\" Strategy with which to vectorize the text. Choose from: <ul> <li>\"bow\": Bag of Words.</li> <li>\"tfidf\": Term Frequency - Inverse Document Frequency.</li> <li>\"hashing\": Vectorize to a matrix of token occurrences.</li> </ul> <p>return_sparse: bool, default=True Whether to return the transformation output as a dataframe of sparse arrays. Must be False when there are other columns in X (besides <code>corpus</code>) that are non-sparse. <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. <p>Attributes[strategy]_: sklearn transformer Estimator instance (lowercase strategy) used to vectorize the corpus, e.g., <code>vectorizer.tfidf_</code> for the tfidf strategy. <p>feature_names_in_: np.ndarray Names of features seen during fit. 
<p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>TextCleaner Applies standard text cleaning to the corpus.</p> <p>TextNormalizer Normalize the corpus.</p> <p>Tokenizer Tokenize the corpus.</p> <p></p>"}, {"location": "API/nlp/vectorizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n&gt;&gt;&gt; y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       
1\n\n\n&gt;&gt;&gt; atom.vectorize(strategy=\"tfidf\", verbose=2)\n\nFitting Vectorizer...\nVectorizing the corpus...\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n   corpus_another  corpus_in  corpus_is  corpus_larger  corpus_line  corpus_ne  corpus_new  corpus_nice  corpus_running  corpus_test  corpus_than  corpus_the  corpus_washington  corpus_york  corpus_\u00e0m  target\n0        0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.759339     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.650696   0.000000       0\n1        0.707107   0.000000   0.000000       0.000000     0.707107   0.000000    0.000000     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.000000   0.000000       1\n2        0.000000   0.000000   0.518242       0.000000     0.000000   0.000000    0.437535     0.631991         0.00000     0.000000     0.000000     0.00000           0.000000     0.374934   0.000000       0\n3        0.000000   0.000000   0.386401       0.471212     0.000000   0.000000    0.326226     0.000000         0.00000     0.000000     0.471212     0.00000           0.471212     0.279551   0.000000       1\n4        0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.000000     0.000000         0.57735     0.577350     0.000000     0.57735           0.000000     0.000000   0.000000       0\n5        0.000000   0.546199   0.000000       0.000000     0.000000   0.546199    0.000000     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.324037   0.546199       1\n6        0.000000   0.000000   0.634086       0.000000     0.000000   0.000000    0.000000     0.000000         0.00000     0.773262     0.000000     0.00000           0.000000     0.000000   0.000000       0\n7        0.000000   0.000000   0.634086       0.000000     0.000000   0.000000    0.000000     0.000000         0.00000     0.773262     0.000000     
0.00000           0.000000     0.000000   0.000000       1\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.nlp import Vectorizer\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n&gt;&gt;&gt; vectorizer = Vectorizer(strategy=\"tfidf\", verbose=2)\n&gt;&gt;&gt; X = vectorizer.fit_transform(X)\n\nFitting Vectorizer...\nVectorizing the corpus...\n\n\n&gt;&gt;&gt; print(X)\n\n   corpus_another  corpus_hi  corpus_in  corpus_is  corpus_larger  corpus_line  corpus_ne  corpus_new  corpus_nice  corpus_running  corpus_test  corpus_than  corpus_the  corpus_there  corpus_this  corpus_washington  corpus_york  corpus_\u00e0m\n0        0.000000   0.000000   0.542162   0.000000       0.000000     0.000000   0.542162    0.000000     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.343774   0.542162\n1        0.000000   0.000000   0.000000   0.415657       0.000000     0.000000   0.000000    0.474072     0.655527        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.415657   0.000000\n2        0.000000   0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.751913     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.659262   0.000000\n3        0.000000   0.525049   0.000000   0.332923       0.000000     0.000000   0.000000    0.000000     0.000000        0.000000     0.379712     0.000000    0.000000      0.525049     0.440032           0.000000     0.000000   0.000000\n4        0.707107   0.000000   0.000000   0.000000       0.000000     0.707107   0.000000    0.000000     0.000000        0.000000     0.000000     0.000000    
0.000000      0.000000     0.000000           0.000000     0.000000   0.000000\n5        0.000000   0.000000   0.000000   0.304821       0.480729     0.000000   0.000000    0.347660     0.000000        0.000000     0.000000     0.480729    0.000000      0.000000     0.000000           0.480729     0.304821   0.000000\n6        0.000000   0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.000000     0.000000        0.629565     0.455297     0.000000    0.629565      0.000000     0.000000           0.000000     0.000000   0.000000\n7        0.000000   0.000000   0.000000   0.497041       0.000000     0.000000   0.000000    0.000000     0.000000        0.000000     0.566893     0.000000    0.000000      0.000000     0.656949           0.000000     0.000000   0.000000\n</code></pre>"}, {"location": "API/nlp/vectorizer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformVectorize the text.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Vectorize the text.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed corpus. </p> <p></p>"}, {"location": "API/pipeline/pipeline/", "title": "Pipeline", "text": "<p>class atom.pipeline.Pipeline(steps, memory=None, verbose=0)[source]Pipeline of transforms with a final estimator.</p> <p>Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be transformers, that is, they must implement <code>fit</code> and <code>transform</code> methods. The final estimator only needs to implement <code>fit</code>. The transformers in the pipeline can be cached using the <code>memory</code> parameter.</p> <p>The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by <code>__</code>, as in the example below. 
A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting it to <code>passthrough</code> or <code>None</code>.</p> <p>Read more in sklearn's user guide.</p> <p>Info</p> <p>This class behaves similarly to sklearn's pipeline, and additionally:</p> <ul> <li>Works with an empty pipeline.</li> <li>Accepts transformers that drop rows.</li> <li>Accepts transformers that are only fitted on a subset of the   provided dataset.</li> <li>Accepts transformers that apply only on the target column.</li> <li>Uses transformers that are only applied on the training set   to fit the pipeline, not to make predictions on new data.</li> <li>The instance is considered fitted at initialization if all   the underlying transformers/estimator in the pipeline are.</li> <li>It returns attributes from the final estimator if they are   not of the Pipeline.</li> <li>The last transformer is also cached.</li> </ul> <p>Warning</p> <p>This Pipeline only works with estimators whose parameters for fit, transform, predict, etc... are named <code>X</code> and/or <code>y</code>.</p> <p>Parameterssteps: list of tuple List of (name, transform) tuples (implementing <code>fit</code>/<code>transform</code>) that are chained in sequential order. <p>memory: str, Memory or None, default=None Used to cache the fitted transformers of the pipeline. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute <code>named_steps</code> or <code>steps</code> to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time-consuming. <p>verbose: int or None, default=0 Verbosity level of the transformers in the pipeline. If None, it leaves them to their original verbosity. If &gt;0, the time elapsed while fitting each step is printed. 
<p>Attributesnamed_steps: Bunch Dictionary-like object, with the following attributes. Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters. <p>classes_: np.ndarray of shape (n_classes,) The class' labels. Only exist if the last step of the pipeline is a classifier. <p>feature_names_in_: np.ndarray Names of features seen during first step <code>fit</code> method. <p>n_features_in_: int Number of features seen during first step <code>fit</code> method. <p></p> <p></p>"}, {"location": "API/pipeline/pipeline/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 165 (1.2%)\n\n\n\n&gt;&gt;&gt; # Apply data cleaning and feature engineering methods\n&gt;&gt;&gt; atom.scale()\n\nFitting Scaler...\nScaling features...\n\n&gt;&gt;&gt; atom.balance(strategy=\"smote\")\n\nOversampling with SMOTE...\n --&gt; Adding 116 samples to class 0.\n\n&gt;&gt;&gt; atom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfe selected 22 features from the dataset.\n   --&gt; Dropping feature mean texture (rank 2).\n   --&gt; Dropping feature mean smoothness (rank 3).\n   --&gt; Dropping feature mean symmetry (rank 9).\n   --&gt; Dropping feature texture error (rank 7).\n   --&gt; Dropping feature smoothness error (rank 4).\n   --&gt; 
Dropping feature concavity error (rank 5).\n   --&gt; Dropping feature worst compactness (rank 8).\n   --&gt; Dropping feature worst fractal dimension (rank 6).\n\n\n&gt;&gt;&gt; # Train models\n&gt;&gt;&gt; atom.run(models=\"LR\")\n\n\nTraining ========================= &gt;&gt;\nModels: LR\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9913\nTest evaluation --&gt; f1: 0.9787\nTime elapsed: 0.030s\n-------------------------------------------------\nTime: 0.030s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.033s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9787\n\n\n&gt;&gt;&gt; # Get the pipeline and make predictions\n&gt;&gt;&gt; pl = atom.lr.export_pipeline()\n&gt;&gt;&gt; print(pl.predict(X))\n\n[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1\n 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1\n 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1\n 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0\n 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1\n 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0\n 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1\n 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1\n 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0\n 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1\n 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1\n 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1\n 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0\n 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n 1 1 1 1 1 1 1 0 0 0 
0 0 0 1]\n</code></pre>"}, {"location": "API/pipeline/pipeline/#methods", "title": "Methods", "text": "<p>decision_functionTransform, then decision_function of the final estimator.fitFit the pipeline.fit_predictTransform the data, and apply <code>fit_predict</code> with the final estimator.fit_transformFit the pipeline and transform the data.get_feature_names_outGet output feature names for transformation.get_paramsGet parameters for this estimator.inverse_transformInverse transform for each step in a reverse order.predictTransform, then predict of the final estimator.predict_log_probaTransform, then predict_log_proba of the final estimator.predict_probaTransform, then predict_proba of the final estimator.scoreTransform, then score of the final estimator.score_samplesTransform the data, and apply <code>score_samples</code> with the final estimator.set_outputSet the output container when <code>\"transform\"</code> and <code>\"fit_transform\"</code> are called.set_paramsSet the parameters of this estimator.transformTransform the data.</p> <p></p> <p>method decision_function(X)[source]Transform, then decision_function of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>Returnsnp.ndarray Predicted confidence scores. </p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Fit the pipeline.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, dict, sequence or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>Else: Array with shape=(n_samples,) to use as target.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. 
</p> <p></p> <p>method fit_predict(X, y=None, **fit_params)[source]Transform the data, and apply <code>fit_predict</code> with the final estimator.</p> <p>ParametersX : iterable Training data. Must fulfill input requirements of first step of the pipeline. <p>y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. <p>**fit_params : dict of string -&gt; object Parameters passed to the <code>fit</code> method of each step, where each parameter name is prefixed such that parameter <code>p</code> for step <code>s</code> has key <code>s__p</code>. <p>Returnsy_pred : ndarray Result of calling <code>fit_predict</code> on the final estimator. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit the pipeline and transform the data.</p> <p>Call <code>fit</code> followed by <code>transform</code> on each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls the <code>transform</code> method. Only valid if the final estimator implements <code>transform</code>. This also works when the final estimator is <code>None</code>, in which case all prior transformations are applied.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the estimator only uses y. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. 
Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_feature_names_out(input_features=None)[source]Get output feature names for transformation.</p> <p>Parametersinput_features : array-like of str or None, default=None Input features. <p>Returnsfeature_names_out : ndarray of str objects Transformed feature names. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : mapping of string to any Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Inverse transform for each step in a reverse order.</p> <p>All estimators in the pipeline must implement the <code>inverse_transform</code> method.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method predict(X, **predict_params)[source]Transform, then predict of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>**predict_params Additional keyword arguments for the predict method. 
Note that while this may be used to return uncertainties from some models with return_std or return_cov, uncertainties that are generated by the transformations in the pipeline are not propagated to the final estimator. <p>Returnsnp.ndarray Predicted classes with shape=(n_samples,). </p> <p></p> <p>method predict_log_proba(X)[source]Transform, then predict_log_proba of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>Returnsnp.ndarray Predicted class log-probabilities. </p> <p></p> <p>method predict_proba(X)[source]Transform, then predict_proba of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>Returnsnp.ndarray Predicted class probabilities. </p> <p></p> <p>method score(X, y, sample_weight=None)[source]Transform, then score of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict, sequence <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>Else: Array with shape=(n_samples,) to use as target.</li> </ul> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>Returnsfloat Mean accuracy or r2 of self.predict(X) with respect to y. </p> <p></p> <p>method score_samples(X)[source]Transform the data, and apply <code>score_samples</code> with the final estimator.</p> <p>ParametersX : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. <p>Returnsy_score : ndarray of shape (n_samples,) Result of calling <code>score_samples</code> on the final estimator. </p> <p></p> <p>method set_output(transform=None)[source]Set the output container when <code>\"transform\"</code> and <code>\"fit_transform\"</code> are called.</p> <p>Parameterstransform : {\"default\", \"pandas\"}, default=None Configure output of <code>transform</code> and <code>fit_transform</code>. 
<ul> <li><code>\"default\"</code>: Default output format of a transformer</li> <li><code>\"pandas\"</code>: DataFrame output</li> <li><code>None</code>: Transform configuration is unchanged</li> </ul> <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method set_params(**kwargs)[source]Set the parameters of this estimator.</p> <p>Parameters**kwargs : dict Parameters of this estimator or parameters of estimators contained in <code>steps</code>. Parameters of the steps may be set using its name and the parameter name separated by a '__'. <p>Returnsself : object Pipeline class instance. </p> <p></p> <p>method transform(X=None, y=None, **kwargs)[source]Transform the data.</p> <p>Call <code>transform</code> on each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls the <code>transform</code> method. Only valid if the final estimator implements <code>transform</code>. This also works when the final estimator is <code>None</code>, in which case all prior transformations are applied.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y. <p>y: int, str, dict, sequence or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>Else: Array with shape=(n_samples,) to use as target.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>_iter</code> inner method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p>"}, {"location": "API/plots/plot_calibration/", "title": "plot_calibration", "text": "<p>method plot_calibration(models=None, rows=\"test\", n_bins=10, target=0, title=None, legend=\"upper left\", figsize=(900, 900), filename=None, display=True)[source]Plot the calibration curve for a binary classifier.</p> <p>Well-calibrated classifiers are probabilistic classifiers for which the output of the <code>predict_proba</code> method can be directly interpreted as a confidence level. For instance, a calibrated (binary) classifier should classify the samples such that among the samples to which it gave a <code>predict_proba</code> value close to 0.8, approx. 80% actually belong to the positive class. Read more in sklearn's documentation.</p> <p>This figure shows two plots: the calibration curve, where the x-axis represents the average predicted probability in each bin and the y-axis is the fraction of positives, i.e., the proportion of samples whose class is the positive class (in each bin); and a distribution of all predicted probabilities of the classifier. This plot is available only for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Tip</p> <p>Use the calibrate method to calibrate the winning model.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>n_bins: int, default=10 Number of bins used for calibration. Minimum of 5 required. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 900) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_lift Plot the lift curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p></p>"}, {"location": "API/plots/plot_calibration/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"RF\", \"LGB\"])\n&gt;&gt;&gt; atom.plot_calibration()\n</code></pre>"}, {"location": "API/plots/plot_components/", "title": "plot_components", "text": "<p>method plot_components(show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the explained variance ratio per component.</p> <p>Kept components are colored and discarded components are transparent. 
This plot is available only when feature selection was applied with strategy=\"pca\".</p> <p>Parametersshow: int or None, default=None Number of components to show. None to show all. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of components shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_pca Plot the explained variance ratio vs number of components.</p> <p>plot_rfecv Plot the rfecv results.</p> <p></p>"}, {"location": "API/plots/plot_components/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.feature_selection(\"pca\", n_features=5)\n&gt;&gt;&gt; atom.plot_components(show=10)\n</code></pre>"}, {"location": "API/plots/plot_confusion_matrix/", "title": "plot_confusion_matrix", "text": "<p>method plot_confusion_matrix(models=None, rows=\"test\", target=0, threshold=0.5, title=None, legend=\"upper right\", figsize=None, filename=None, display=True)[source]Plot a model's confusion matrix.</p> <p>For one model, the plot shows a heatmap. For multiple models, it compares TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). This plot is available only for classification tasks.</p> <p>Tip</p> <p>Fill the <code>threshold</code> parameter with the result from the model's <code>get_best_threshold</code> method to optimize the results.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: hashable, segment or sequence, default=\"test\" Selection of rows on which to calculate the confusion matrix. <p>target: int or str, default=0 Target column to look at. Only for multioutput tasks. <p>threshold: float, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only for binary classification tasks. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the plot's type. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_calibration Plot the calibration curve for a binary classifier.</p> <p>plot_threshold Plot metric performances against threshold values.</p> <p></p>"}, {"location": "API/plots/plot_confusion_matrix/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, test_size=0.4)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.lr.plot_confusion_matrix()  # For one model\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_confusion_matrix()  # For multiple models\n</code></pre>"}, {"location": "API/plots/plot_correlation/", "title": "plot_correlation", "text": "<p>method plot_correlation(columns=None, method=\"pearson\", title=None, legend=None, figsize=(800, 700), filename=None, display=True)[source]Plot a correlation matrix.</p> <p>Displays a heatmap showing the correlation between columns in the dataset. The colors red, blue and white stand for positive, negative, and no correlation respectively.</p> <p>Parameterscolumns: segment, sequence, dataframe or None, default=None Columns to plot. If None, plot all columns in the dataset. Selected categorical columns are ignored. <p>method: str, default=\"pearson\" Method of correlation. Choose from: pearson, kendall or spearman. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple, default=(800, 700) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. 
The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_distribution Plot column distributions.</p> <p>plot_qq Plot a quantile-quantile plot.</p> <p>plot_relationships Plot pairwise relationships in a dataset.</p> <p></p>"}, {"location": "API/plots/plot_correlation/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.plot_correlation()\n</code></pre>"}, {"location": "API/plots/plot_det/", "title": "plot_det", "text": "<p>method plot_det(models=None, rows=\"test\", target=0, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the Detection Error Tradeoff curve.</p> <p>Read more about DET in sklearn's documentation. Only available for binary classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_gains Plot the cumulative gains curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p></p>"}, {"location": "API/plots/plot_det/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_det()\n</code></pre>"}, {"location": "API/plots/plot_distribution/", "title": "plot_distribution", "text": "<p>method plot_distribution(columns=0, distributions=\"kde\", show=None, title=None, legend=\"upper right\", figsize=None, filename=None, display=True)[source]Plot column distributions.</p> <ul> <li>For numerical columns, plot the probability density   distribution. 
Additionally, it's possible to plot any of   <code>scipy.stats</code> distributions fitted to the column.</li> <li>For categorical columns, plot the class distribution.   Only one categorical column can be plotted at the same time.</li> </ul> <p>Tip</p> <p>Use atom's distribution method to check which distribution fits the column best.</p> <p>Parameterscolumns: int, str, slice or sequence, default=0 Columns to plot. It's only possible to plot one categorical column. If more than one categorical column is selected, all categorical columns are ignored. <p>distributions: str, sequence or None, default=\"kde\" Distributions to fit. Only for numerical columns. <ul> <li>If None: No distribution is fit.</li> <li>If \"kde\": Fit a Gaussian kde distribution.</li> <li>Else: Name of a <code>scipy.stats</code> distribution.</li> </ul> <p>show: int or None, default=None Number of classes (ordered by number of occurrences) to show in the plot. If None, it shows all classes. Only for categorical columns. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None: No title is shown.</li> <li>If str: Text for the title.</li> <li>If dict: title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the plot's type. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. 
<p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_correlation Plot a correlation matrix.</p> <p>plot_qq Plot a quantile-quantile plot.</p> <p>plot_relationships Plot pairwise relationships in a dataset.</p> <p></p>"}, {"location": "API/plots/plot_distribution/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add a categorical feature\n&gt;&gt;&gt; animals = [\"cat\", \"dog\", \"bird\", \"lion\", \"zebra\"]\n&gt;&gt;&gt; probabilities = [0.001, 0.1, 0.2, 0.3, 0.399]\n&gt;&gt;&gt; X[\"animals\"] = np.random.choice(animals, size=len(X), p=probabilities)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.plot_distribution(columns=[0, 1])\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_distribution(columns=0, distributions=[\"norm\", \"invgauss\"])\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_distribution(columns=\"animals\")\n</code></pre>"}, {"location": "API/plots/plot_edf/", "title": "plot_edf", "text": "<p>method plot_edf(models=None, metric=None, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot the Empirical Distribution Function of a study.</p> <p>Use this plot to analyze and improve hyperparameter search spaces. The EDF assumes that the value of the objective function is in accordance with the uniform distribution over the objective space. This plot is only available for models that ran hyperparameter tuning.</p> <p>Note</p> <p>Only complete trials are considered when plotting the EDF.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. 
<p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). If str, add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_edf/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from optuna.distributions import IntDistribution\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n\n&gt;&gt;&gt; # Run three models with different search spaces\n&gt;&gt;&gt; atom.run(\n...     models=\"RF_1\",\n...     n_trials=20,\n...     
ht_params={\"distributions\": {\"n_estimators\": IntDistribution(6, 10)}},\n... )\n&gt;&gt;&gt; atom.run(\n...     models=\"RF_2\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(11, 15)}},\n... )\n&gt;&gt;&gt; atom.run(\n...     models=\"RF_3\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(16, 20)}},\n... )\n\n&gt;&gt;&gt; atom.plot_edf()\n</code></pre>"}, {"location": "API/plots/plot_errors/", "title": "plot_errors", "text": "<p>method plot_errors(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot a model's prediction errors.</p> <p>Plot the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, intersected line shows the identity line. This plot can be useful to detect noise or heteroscedasticity along a range of the target domain. This plot is available only for regression tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multioutput tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_residuals Plot a model's residuals.</p> <p></p>"}, {"location": "API/plots/plot_errors/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y)\n&gt;&gt;&gt; atom.run([\"OLS\", \"LGB\"])\n&gt;&gt;&gt; atom.plot_errors()\n</code></pre>"}, {"location": "API/plots/plot_evals/", "title": "plot_evals", "text": "<p>method plot_evals(models=None, dataset=\"test\", title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot evaluation curves.</p> <p>The evaluation curves are the main metric scores achieved by the models at every iteration of the training process. This plot is available only for models that allow in-training validation.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>dataset: str, default=\"test\" Data set for which to plot the evaluation curves. Use <code>+</code> between options to select more than one. Choose from: \"train\", \"test\". <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_evals/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"XGB\", \"LGB\"])\n&gt;&gt;&gt; atom.plot_evals()\n</code></pre>"}, {"location": "API/plots/plot_feature_importance/", "title": "plot_feature_importance", "text": "<p>method plot_feature_importance(models=None, show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot a model's feature importance.</p> <p>The sum of importances for all features (per model) is 1. 
This plot is available only for models whose estimator has a <code>scores_</code>, <code>feature_importances_</code> or <code>coef_</code> attribute.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p>plot_partial_dependence Plot the partial dependence of features.</p> <p>plot_permutation_importance Plot the feature permutation importance of models.</p> <p></p>"}, {"location": "API/plots/plot_feature_importance/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_feature_importance(show=10)\n</code></pre>"}, {"location": "API/plots/plot_forecast/", "title": "plot_forecast", "text": "<p>method plot_forecast(models=None, fh=\"test\", X=None, target=0, plot_interval=True, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot a time series with model forecasts.</p> <p>This plot is only available for forecasting tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. If no models are selected, only the target column is plotted. <p>fh: hashable, segment, sequence or ForecastingHorizon, default=\"test\" Forecast horizon for which to plot the predictions. <p>X: dataframe-like or None, default=None Exogenous time series corresponding to fh. This parameter is ignored if fh is a data set. <p>target: int or str, default=0 Target column to look at. Only for multivariate tasks. <p>plot_interval: bool, default=True Whether to plot prediction intervals instead of the exact prediction values. If True, the plotted estimators should have a <code>predict_interval</code> method. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_lift Plot the lift curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p></p>"}, {"location": "API/plots/plot_forecast/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.plot_forecast()\n</code></pre> <pre><code>&gt;&gt;&gt; atom.run(\n...     models=\"arima\",\n...     est_params={\"order\": (1, 1, 0), \"seasonal_order\": (0, 1, 0, 12)},\n... 
)\n&gt;&gt;&gt; atom.plot_forecast()\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_forecast(fh=\"train+test\", plot_interval=False)\n</code></pre> <pre><code>&gt;&gt;&gt; # Forecast the next 4 years starting from the test set\n&gt;&gt;&gt; atom.plot_forecast(fh=range(1, 48))\n</code></pre>"}, {"location": "API/plots/plot_gains/", "title": "plot_gains", "text": "<p>method plot_gains(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the cumulative gains curve.</p> <p>This plot is available only for binary and multilabel classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. 
<p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_det Plot the Detection Error Tradeoff curve.</p> <p>plot_lift Plot the lift curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p></p>"}, {"location": "API/plots/plot_gains/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_gains()\n</code></pre>"}, {"location": "API/plots/plot_hyperparameter_importance/", "title": "plot_hyperparameter_importance", "text": "<p>method plot_hyperparameter_importance(models=None, metric=0, show=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot a model's hyperparameter importance.</p> <p>The hyperparameter importances are calculated using the fANOVA importance evaluator. The sum of all importances for all parameters (per model) is 1. This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. <p>metric: int or str, default=0 Metric to plot (only for multi-metric runs). <p>show: int or None, default=None Number of hyperparameters (ordered by importance) to show. None to show all. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Legend for the plot. 
See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_feature_importance Plot a model's feature importance.</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_hyperparameter_importance/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"ET\", \"RF\"], n_trials=10)\n&gt;&gt;&gt; atom.plot_hyperparameter_importance()\n</code></pre>"}, {"location": "API/plots/plot_hyperparameters/", "title": "plot_hyperparameters", "text": "<p>method plot_hyperparameters(models=None, params=(0, 1), metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot hyperparameter relationships in a study.</p> <p>A model's hyperparameters are plotted against each other. The corresponding metric scores are displayed in a contour plot. The markers are the trials in the study. 
This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_hyperparameters()</code>. <p>params: str, segment or sequence, default=(0, 1) Hyperparameters to plot. Use a sequence or add <code>+</code> between options to select more than one. <p>metric: int or str, default=0 Metric to plot (only for multi-metric runs). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_hyperparameter_importance Plot a model's hyperparameter importance.</p> <p>plot_parallel_coordinate Plot high-dimensional parameter relationships in a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_hyperparameters/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\", n_trials=15)\n&gt;&gt;&gt; atom.plot_hyperparameters(params=(0, 1, 2))\n</code></pre>"}, {"location": "API/plots/plot_learning_curve/", "title": "plot_learning_curve", "text": "<p>method plot_learning_curve(models=None, metric=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the learning curve: score vs number of training samples.</p> <p>This plot is available only for models fitted using train sizing. Ensembles are ignored.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Use a sequence or add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_results Plot the model results.</p> <p>plot_successive_halving Plot scores per iteration of the successive halving.</p> <p></p>"}, {"location": "API/plots/plot_learning_curve/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.train_sizing([\"LR\", \"RF\"], n_bootstrap=5)\n&gt;&gt;&gt; atom.plot_learning_curve()\n</code></pre>"}, {"location": "API/plots/plot_lift/", "title": "plot_lift", "text": "<p>method plot_lift(models=None, rows=\"test\", target=0, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the lift curve.</p> <p>Only available for binary classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. 
<ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_det Plot the Detection Error Tradeoff curve.</p> <p>plot_gains Plot the cumulative gains curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p></p>"}, {"location": "API/plots/plot_lift/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_lift()\n</code></pre>"}, {"location": "API/plots/plot_ngrams/", "title": "plot_ngrams", "text": "<p>method plot_ngrams(ngram=\"bigram\", rows=\"dataset\", show=10, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot n-gram frequencies.</p> <p>The text for the plot is extracted from the column named <code>corpus</code>. If there is no column with that name, an exception is raised. If the documents are not tokenized, the words are separated by spaces.</p> <p>Tip</p> <p>Use atom's tokenize method to separate the words creating n-grams based on their frequency in the corpus.</p> <p>Parametersngram: str or int, default=\"bigram\" Number of contiguous words to search for (size of n-gram). Choose from: word (1), bigram (2), trigram (3), quadgram (4). <p>rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows in the corpus to include in the search. <p>show: int or None, default=10 Number of n-grams (ordered by number of occurrences) to show in the plot. If None, show all n-grams (up to 200). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of n-grams shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_wordcloud Plot a wordcloud from the corpus.</p> <p></p>"}, {"location": "API/plots/plot_ngrams/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import fetch_20newsgroups\n\n&gt;&gt;&gt; X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n&gt;&gt;&gt; X = np.array(X).reshape(-1, 1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.textclean()\n&gt;&gt;&gt; atom.textnormalize()\n&gt;&gt;&gt; atom.plot_ngrams()\n</code></pre>"}, {"location": "API/plots/plot_parallel_coordinate/", "title": "plot_parallel_coordinate", "text": "<p>method plot_parallel_coordinate(models=None, params=None, metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot high-dimensional parameter relationships in a study.</p> <p>Every line of the plot represents one trial. 
This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_parallel_coordinate()</code>. <p>params: str, segment, sequence or None, default=None Hyperparameters to plot. Use a sequence or add <code>+</code> between options to select more than one. If None, all the model's hyperparameters are selected. <p>metric: int or str, default=0 Metric to plot (only for multi-metric runs). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_edf Plot the Empirical Distribution Function of a study.</p> <p>plot_hyperparameter_importance Plot a model's hyperparameter importance.</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p></p>"}, {"location": "API/plots/plot_parallel_coordinate/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"RF\", n_trials=15)\n&gt;&gt;&gt; atom.plot_parallel_coordinate(params=slice(1, 5))\n</code></pre>"}, {"location": "API/plots/plot_pareto_front/", "title": "plot_pareto_front", "text": "<p>method plot_pareto_front(models=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot the Pareto front of a study.</p> <p>Shows the trial scores plotted against each other. The marker's colors indicate the trial number. This plot is only available for models with multi-metric runs and hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_pareto_front()</code>. <p>metric: str, sequence or None, default=None Metrics to plot.  Use a sequence or add <code>+</code> between options to select more than one. If None, the metrics used to run the pipeline are selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. 
<p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of metrics shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_edf Plot the Empirical Distribution Function of a study.</p> <p>plot_slice Plot the parameter relationship in a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_pareto_front/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\n...     models=\"RF\",\n...     metric=[\"f1\", \"accuracy\", \"recall\"],\n...     n_trials=15,\n...  )\n&gt;&gt;&gt; atom.plot_pareto_front()\n</code></pre>"}, {"location": "API/plots/plot_parshap/", "title": "plot_parshap", "text": "<p>method plot_parshap(models=None, columns=None, target=1, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot the partial correlation of shap values.</p> <p>Plots the train and test correlation between the shap value of every feature with its target value, after removing the effect of all other features (partial correlation). This plot is useful to identify the features that are contributing most to overfitting. 
Features that lie below the bisector (diagonal line) performed worse on the test set than on the training set. If the estimator has a <code>scores_</code>, <code>feature_importances_</code> or <code>coef_</code> attribute, its normalized values are shown in a color map.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>columns: int, str, segment, sequence or None, default=None Features to plot. If None, it plots all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_feature_importance Plot a model's feature importance.</p> <p>plot_partial_dependence Plot the partial dependence of features.</p> <p>plot_permutation_importance Plot the feature permutation importance of models.</p> <p></p>"}, {"location": "API/plots/plot_parshap/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"GNB\", \"RF\"])\n&gt;&gt;&gt; atom.rf.plot_parshap(legend=None)\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_parshap(columns=slice(5, 10))\n</code></pre>"}, {"location": "API/plots/plot_partial_dependence/", "title": "plot_partial_dependence", "text": "<p>method plot_partial_dependence(models=None, columns=(0, 1, 2), kind=\"average\", pair=None, target=1, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the partial dependence of features.</p> <p>The partial dependence of a feature (or a set of features) corresponds to the response of the model for each possible value of the feature. The plot can take two forms:</p> <ul> <li>If <code>pair</code> is None: Single feature partial dependence lines.   The deciles of the feature values are shown with tick marks   on the bottom.</li> <li>If <code>pair</code> is defined: Two-way partial dependence plots are   plotted as contour plots (only allowed for a single model).</li> </ul> <p>Read more about partial dependence on sklearn's documentation. This plot is not available for multilabel nor multiclass-multioutput classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. 
<p>columns: int, str, segment, sequence, dataframe, default=(0, 1, 2) Features to get the partial dependence from. <p>kind: str or sequence, default=\"average\" Kind of dependence to plot. Use a sequence or add <code>+</code> between options to select more than one. Choose from: <ul> <li>\"average\": Partial dependence averaged across all samples   in the dataset.</li> <li>\"individual\": Partial dependence for up to 50 random   samples (Individual Conditional Expectation).</li> </ul> <p>This parameter is ignored when plotting feature pairs.</p> <p>pair: int, str or None, default=None Feature with which to pair the features selected by <code>columns</code>. If specified, the resulting figure displays contour plots. Only allowed when plotting a single model. If None, the plots show the partial dependence of single features. <p>target: int or str, default=1 Class in the target column to look at (only for multiclass classification tasks). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_feature_importance Plot a model's feature importance.</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p>plot_permutation_importance Plot the feature permutation importance of models.</p> <p></p>"}, {"location": "API/plots/plot_partial_dependence/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_partial_dependence(kind=\"average+individual\", legend=\"upper left\")\n</code></pre> <pre><code>&gt;&gt;&gt; atom.rf.plot_partial_dependence(columns=(3, 4), pair=2)\n</code></pre>"}, {"location": "API/plots/plot_pca/", "title": "plot_pca", "text": "<p>method plot_pca(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the explained variance ratio vs number of components.</p> <p>If the underlying estimator is PCA (for dense datasets), all possible components are plotted. If the underlying estimator is TruncatedSVD (for sparse datasets), it only shows the selected components. The star marks the number of components selected by the user. This plot is available only when feature selection was applied with strategy=\"pca\".</p> <p>Parameterstitle: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). 
If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_components Plot the explained variance ratio per component.</p> <p>plot_rfecv Plot the rfecv results.</p> <p></p>"}, {"location": "API/plots/plot_pca/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.feature_selection(\"pca\", n_features=5)\n&gt;&gt;&gt; atom.plot_pca()\n</code></pre>"}, {"location": "API/plots/plot_permutation_importance/", "title": "plot_permutation_importance", "text": "<p>method plot_permutation_importance(models=None, show=None, n_repeats=10, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the feature permutation importance of models.</p> <p>Warning</p> <p>This method can be slow. Results are cached to speed up repeated calls.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>n_repeats: int, default=10 Number of times to permute each feature. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_feature_importance Plot a model's feature importance.</p> <p>plot_partial_dependence Plot the partial dependence of features.</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p></p>"}, {"location": "API/plots/plot_permutation_importance/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_permutation_importance(show=10, n_repeats=7)\n</code></pre>"}, {"location": "API/plots/plot_pipeline/", "title": "plot_pipeline", "text": "<p>method plot_pipeline(models=None, draw_hyperparameter_tuning=True, color_branches=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot a diagram of the pipeline.</p> <p>Warning</p> <p>This plot uses the schemdraw package, which is incompatible with plotly. 
The returned plot is therefore a matplotlib figure.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models for which to draw the pipeline. If None, all pipelines are plotted. <p>draw_hyperparameter_tuning: bool, default=True Whether to show in the diagram if the models used hyperparameter tuning. <p>color_branches: bool or None, default=None Whether to draw every branch in a different color. If None, branches are colored when there is more than one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the pipeline drawn. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_wordcloud Plot a wordcloud from the corpus.</p> <p></p>"}, {"location": "API/plots/plot_pipeline/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"GNB\", \"RNN\", \"SGD\", \"MLP\"])\n&gt;&gt;&gt; atom.voting(models=atom.winners[:2])\n&gt;&gt;&gt; atom.plot_pipeline()\n</code></pre><pre><code>&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.scale()\n&gt;&gt;&gt; atom.prune()\n&gt;&gt;&gt; atom.run(\"RF\", n_trials=30)\n\n&gt;&gt;&gt; atom.branch = \"undersample\"\n&gt;&gt;&gt; atom.balance(\"nearmiss\")\n&gt;&gt;&gt; atom.run(\"RF_undersample\")\n\n&gt;&gt;&gt; atom.branch = \"oversample_from_main\"\n&gt;&gt;&gt; atom.balance(\"smote\")\n&gt;&gt;&gt; atom.run(\"RF_oversample\")\n\n&gt;&gt;&gt; atom.plot_pipeline()\n</code></pre>"}, {"location": "API/plots/plot_prc/", "title": "plot_prc", "text": "<p>method plot_prc(models=None, rows=\"test\", target=0, title=None, legend=\"lower left\", figsize=(900, 600), filename=None, display=True)[source]Plot the precision-recall curve.</p> <p>Read more about PRC in sklearn's documentation. Only available for binary classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. 
<p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_det Plot the Detection Error Tradeoff curve.</p> <p>plot_lift Plot the lift curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p></p>"}, {"location": "API/plots/plot_prc/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_prc()\n</code></pre>"}, {"location": "API/plots/plot_probabilities/", "title": "plot_probabilities", "text": "<p>method plot_probabilities(models=None, rows=\"test\", target=1, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the probability distribution of the target classes.</p> <p>This plot is available only for models with a <code>predict_proba</code> method in classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: hashable, segment or sequence, default=\"test\" Selection of rows on which to calculate the metric. <p>target: int, str or tuple, default=1 Probability of being that class in the target column. For multioutput tasks, the value should be a tuple of the form (column, class). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_confusion_matrix Plot a model's confusion matrix.</p> <p>plot_results Plot the model results.</p> <p>plot_threshold Plot metric performances against threshold values.</p> <p></p>"}, {"location": "API/plots/plot_probabilities/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_probabilities()\n</code></pre>"}, {"location": "API/plots/plot_qq/", "title": "plot_qq", "text": "<p>method plot_qq(columns=0, distributions=\"norm\", title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot a quantile-quantile plot.</p> <p>Columns are distinguished by color and the distributions are distinguished by marker type. Missing values are ignored.</p> <p>Parameterscolumns: int, str, slice or sequence, default=0 Columns to plot. Selected categorical columns are ignored. <p>distributions: str or sequence, default=\"norm\" Names of the <code>scipy.stats</code> distributions to fit to the columns. 
<p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_correlation Plot a correlation matrix.</p> <p>plot_distribution Plot column distributions.</p> <p>plot_relationships Plot pairwise relationships in a dataset.</p> <p></p>"}, {"location": "API/plots/plot_qq/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.plot_qq(columns=[5, 6])\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_qq(columns=0, distributions=[\"norm\", \"invgauss\", \"triang\"])\n</code></pre>"}, {"location": "API/plots/plot_relationships/", "title": "plot_relationships", "text": "<p>method plot_relationships(columns=(0, 1, 2), title=None, legend=None, figsize=(900, 900), filename=None, display=True)[source]Plot pairwise relationships in a dataset.</p> <p>Creates a grid of axes such that each numerical column appears once on the x-axes and once on the y-axes. The bottom triangle contains scatter plots (max 250 random samples), the diagonal plots contain column distributions, and the upper triangle contains contour histograms for all samples in the columns.</p> <p>Parameterscolumns: segment, sequence or dataframe, default=(0, 1, 2) Columns to plot. Selected categorical columns are ignored. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple, default=(900, 900) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). 
If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_correlation Plot a correlation matrix.</p> <p>plot_distribution Plot column distributions.</p> <p>plot_qq Plot a quantile-quantile plot.</p> <p></p>"}, {"location": "API/plots/plot_relationships/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.plot_relationships(columns=[0, 4, 5])\n</code></pre>"}, {"location": "API/plots/plot_residuals/", "title": "plot_residuals", "text": "<p>method plot_residuals(models=None, rows=\"test\", target=0, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot a model's residuals.</p> <p>The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, intersected line shows the identity line. This plot can be useful to analyze the variance of the regressor's errors. If the points are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate. This plot is only available for regression tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. 
<ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multioutput tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_errors Plot a model's prediction errors.</p> <p></p>"}, {"location": "API/plots/plot_residuals/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y)\n&gt;&gt;&gt; atom.run([\"OLS\", \"LGB\"])\n&gt;&gt;&gt; atom.plot_residuals()\n</code></pre>"}, {"location": "API/plots/plot_results/", "title": "plot_results", "text": "<p>method plot_results(models=None, metric=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the model results.</p> <p>If all models applied bootstrap, the plot is a boxplot. If not, the plot is a barplot. Models are ordered based on their score from the top down. The score is either the <code>[metric]_bootstrap</code> or <code>[metric]_test</code> values, selected in that order.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Other available options are: \"time_bo\", \"time_fit\", \"time_bootstrap\", \"time\". If str, add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of models. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_confusion_matrix Plot a model's confusion matrix.</p> <p>plot_probabilities Plot the probability distribution of the target classes.</p> <p>plot_threshold Plot metric performances against threshold values.</p> <p></p>"}, {"location": "API/plots/plot_results/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"GNB\", \"LR\", \"RF\", \"LGB\"], metric=[\"f1\", \"recall\"])\n&gt;&gt;&gt; atom.plot_results()\n</code></pre> <pre><code>&gt;&gt;&gt; atom.run([\"GNB\", \"LR\", \"RF\", \"LGB\"], metric=[\"f1\", \"recall\"], n_bootstrap=5)\n&gt;&gt;&gt; atom.plot_results()\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_results(metric=\"time_fit+time\")\n</code></pre>"}, {"location": "API/plots/plot_rfecv/", "title": "plot_rfecv", "text": "<p>method plot_rfecv(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the rfecv results.</p> <p>Plot the scores obtained by the estimator fitted on every subset of the dataset. Only available when feature selection was applied with strategy=\"rfecv\".</p> <p>Parameterstitle: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_components Plot the explained variance ratio per component.</p> <p>plot_pca Plot the explained variance ratio vs number of components.</p> <p></p>"}, {"location": "API/plots/plot_rfecv/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.feature_selection(\"rfecv\", solver=\"Tree\")\n&gt;&gt;&gt; atom.plot_rfecv()\n</code></pre>"}, {"location": "API/plots/plot_roc/", "title": "plot_roc", "text": "<p>method plot_roc(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the Receiver Operating Characteristics curve.</p> <p>Read more about ROC in sklearn's documentation. 
Only available for classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_gains Plot the cumulative gains curve.</p> <p>plot_lift Plot the lift curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p></p>"}, {"location": "API/plots/plot_roc/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_roc()\n</code></pre>"}, {"location": "API/plots/plot_shap_bar/", "title": "plot_shap_bar", "text": "<p>method plot_shap_bar(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's bar plot.</p> <p>Create a bar plot of a set of SHAP values. If a single sample is passed, then the SHAP values are plotted. If many samples are passed, then the mean absolute value for each feature column is plotted. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_bar()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_scatter Plot SHAP's scatter plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_bar/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_bar(show=10)\n</code></pre>"}, {"location": "API/plots/plot_shap_beeswarm/", "title": "plot_shap_beeswarm", "text": "<p>method plot_shap_beeswarm(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's beeswarm plot.</p> <p>The plot is colored by feature values. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. 
Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_beeswarm()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_beeswarm method does not support plotting a single sample. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p>plot_shap_bar Plot SHAP's bar plot.</p> <p>plot_shap_scatter Plot SHAP's scatter plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_beeswarm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_beeswarm(show=10)\n</code></pre>"}, {"location": "API/plots/plot_shap_decision/", "title": "plot_shap_decision", "text": "<p>method plot_shap_decision(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's decision plot.</p> <p>Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values are printed in the plot (if supplied). If multiple predictions are plotted together, feature values will not be printed. Plotting too many predictions together will make the plot unintelligible. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_decision()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). 
Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_shap_bar Plot SHAP's bar plot.</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_force Plot SHAP's force plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_decision/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_decision(show=10)\n</code></pre><pre><code>&gt;&gt;&gt; atom.plot_shap_decision(rows=-1, show=10)\n</code></pre>"}, {"location": "API/plots/plot_shap_force/", "title": "plot_shap_force", "text": "<p>method plot_shap_force(models=None, rows=\"test\", target=1, title=None, legend=None, figsize=(900, 300), filename=None, display=True, **kwargs)[source]Plot SHAP's force plot.</p> <p>Visualize the given SHAP values with an 
additive force layout. Note that by default this plot will render using javascript. For a regular figure use <code>matplotlib=True</code> (this option is only available when only a single sample is plotted). Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_force()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=(900, 300) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure (only if <code>matplotlib=True</code> in <code>kwargs</code>). <p>**kwargs Additional keyword arguments for shap.plots.force. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_scatter Plot SHAP's scatter plot.</p> <p>plot_shap_decision Plot SHAP's decision plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_force/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_force(rows=-2, matplotlib=True, figsize=(1800, 300))\n</code></pre>"}, {"location": "API/plots/plot_shap_heatmap/", "title": "plot_shap_heatmap", "text": "<p>method plot_shap_heatmap(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's heatmap plot.</p> <p>This plot is designed to show the population substructure of a dataset using supervised clustering and a heatmap. Supervised clustering involves clustering data points not by their original feature values but by their explanations. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_heatmap()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_heatmap method does not support plotting a single sample. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). 
Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_shap_decision Plot SHAP's decision plot.</p> <p>plot_shap_force Plot SHAP's force plot.</p> <p>plot_shap_waterfall Plot SHAP's waterfall plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_heatmap/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_heatmap(show=10)\n</code></pre>"}, {"location": "API/plots/plot_shap_scatter/", "title": "plot_shap_scatter", "text": "<p>method plot_shap_scatter(models=None, rows=\"test\", columns=0, target=1, title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot SHAP's scatter plot.</p> <p>Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. 
This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_scatter()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_scatter method does not support plotting a single sample. <p>columns: int or str, default=0 Column to plot. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_decision Plot SHAP's decision plot.</p> <p>plot_shap_force Plot SHAP's force plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_scatter/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_scatter(columns=\"symmetry error\")\n</code></pre>"}, {"location": "API/plots/plot_shap_waterfall/", "title": "plot_shap_waterfall", "text": "<p>method plot_shap_waterfall(models=None, rows=0, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's waterfall plot.</p> <p>The SHAP value of a feature represents the impact of the evidence provided by that feature on the model\u2019s output. The waterfall plot is designed to visually display how the SHAP values (evidence) of each feature move the model output from our prior expectation under the background data distribution, to the final model prediction given the evidence of all the features. Features are sorted by the magnitude of their SHAP values with the smallest magnitude features grouped together at the bottom of the plot when the number of features in the models exceeds the <code>show</code> parameter. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_waterfall()</code>. <p>rows: int or str, default=0 Selection of rows to plot. The plot_shap_waterfall method does not support plotting multiple samples. 
<p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_shap_bar Plot SHAP's bar plot.</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_heatmap Plot SHAP's heatmap plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_waterfall/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_waterfall(show=10)\n</code></pre>"}, {"location": "API/plots/plot_slice/", "title": "plot_slice", "text": "<p>method plot_slice(models=None, params=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot the parameter relationship in a study.</p> <p>The color of the markers indicates the trial. This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_slice()</code>. <p>params: str, segment, sequence or None, default=None Hyperparameters to plot. Use a sequence or add <code>+</code> between options to select more than one. If None, all the model's hyperparameters are selected. <p>metric: int or str, default=None Metric to plot (only for multi-metric runs). If str, add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. 
<p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_edf Plot the Empirical Distribution Function of a study.</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p>plot_parallel_coordinate Plot high-dimensional parameter relationships in a study.</p> <p></p>"}, {"location": "API/plots/plot_slice/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\n...     models=\"RF\",\n...     metric=[\"f1\", \"recall\"],\n...     n_trials=15,\n... )\n&gt;&gt;&gt; atom.plot_slice(params=(0, 1, 2))\n</code></pre>"}, {"location": "API/plots/plot_successive_halving/", "title": "plot_successive_halving", "text": "<p>method plot_successive_halving(models=None, metric=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot scores per iteration of the successive halving.</p> <p>Only use with models fitted using successive halving. Ensembles are ignored.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. 
<p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Use a sequence or add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_learning_curve Plot the learning curve: score vs number of training samples.</p> <p>plot_results Plot the model results.</p> <p></p>"}, {"location": "API/plots/plot_successive_halving/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.successive_halving([\"Tree\", \"Bag\", \"RF\", \"LGB\"], n_bootstrap=5)\n&gt;&gt;&gt; atom.plot_successive_halving()\n</code></pre>"}, {"location": "API/plots/plot_terminator_improvement/", "title": "plot_terminator_improvement", "text": "<p>method plot_terminator_improvement(models=None, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the potentials for future objective improvement.</p> <p>This function visualizes the objective improvement potentials. It helps to determine whether you should continue the optimization or not. The evaluated error is also plotted. Note that this function may take some time to compute the improvement potentials. This plot is only available for models that ran hyperparameter tuning.</p> <p>Warning</p> <ul> <li>The plot_terminator_improvement method is only available   for models that ran hyperparameter tuning using   cross-validation, e.g., using <code>ht_params={'cv': 5}</code>.</li> <li>This method does not support   [multi-objective optimizations][multi-metric runs].</li> <li>The calculation of the improvement can be slow. Set the   <code>memory</code> parameter to cache the   results and speed up repeated calls.</li> </ul> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y) <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_pareto_front Plot the Pareto front of a study.</p> <p>plot_timeline Plot the timeline of a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_terminator_improvement/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"RF\", n_trials=10, ht_params={\"cv\": 5})\n&gt;&gt;&gt; atom.plot_terminator_improvement()\n</code></pre>"}, {"location": "API/plots/plot_threshold/", "title": "plot_threshold", "text": "<p>method plot_threshold(models=None, metric=None, rows=\"test\", target=0, steps=100, title=None, legend=\"lower left\", figsize=(900, 600), filename=None, display=True)[source]Plot metric performances against threshold values.</p> <p>This plot is available 
only for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>metric: str, func, scorer, sequence or None, default=None Metric to plot. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred)</code>, a scorer object or a sequence of these. Use a sequence or add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows on which to calculate the metric. <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>steps: int, default=100 Number of thresholds measured. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_calibration Plot the calibration curve for a binary classifier.</p> <p>plot_confusion_matrix Plot a model's confusion matrix.</p> <p>plot_probabilities Plot the probability distribution of the target classes.</p> <p></p>"}, {"location": "API/plots/plot_threshold/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_threshold()\n</code></pre>"}, {"location": "API/plots/plot_timeline/", "title": "plot_timeline", "text": "<p>method plot_timeline(models=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the timeline of a study.</p> <p>This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y) <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. 
If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_edf Plot the Empirical Distribution Function of a study.</p> <p>plot_slice Plot the parameter relationship in a study.</p> <p>plot_terminator_improvement Plot the potentials for future objective improvement.</p> <p></p>"}, {"location": "API/plots/plot_timeline/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from optuna.pruners import PatientPruner\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\n...     models=\"LGB\",\n...     n_trials=15,\n...     ht_params={\"pruner\": PatientPruner(None, patience=2)},\n... )\n&gt;&gt;&gt; atom.plot_timeline()\n</code></pre>"}, {"location": "API/plots/plot_trials/", "title": "plot_trials", "text": "<p>method plot_trials(models=None, metric=None, title=None, legend=\"upper left\", figsize=(900, 800), filename=None, display=True)[source]Plot the hyperparameter tuning trials.</p> <p>Creates a figure with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. The best trial is indicated with a star. This is the same plot as produced by <code>ht_params={\"plot\": True}</code>. This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. <p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Add <code>+</code> between options to select more than one. 
If None, all metrics are selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 800) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_evals Plot evaluation curves.</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p>plot_results Plot the model results.</p> <p></p>"}, {"location": "API/plots/plot_trials/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"ET\", \"RF\"], n_trials=15)\n&gt;&gt;&gt; atom.plot_trials()\n</code></pre>"}, {"location": "API/plots/plot_wordcloud/", "title": "plot_wordcloud", "text": "<p>method plot_wordcloud(rows=\"dataset\", title=None, legend=None, figsize=(900, 600), filename=None, display=True, **kwargs)[source]Plot a wordcloud from the corpus.</p> <p>The text for the plot is extracted from the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows in the corpus to include in the wordcloud. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. 
<p>**kwargs Additional keyword arguments for the Wordcloud object. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_ngrams Plot n-gram frequencies.</p> <p>plot_pipeline Plot a diagram of the pipeline.</p> <p></p>"}, {"location": "API/plots/plot_wordcloud/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import fetch_20newsgroups\n\n&gt;&gt;&gt; X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n&gt;&gt;&gt; X = np.array(X).reshape(-1, 1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.textclean()\n&gt;&gt;&gt; atom.textnormalize()\n&gt;&gt;&gt; atom.plot_wordcloud()\n</code></pre>"}, {"location": "API/training/directclassifier/", "title": "DirectClassifier", "text": "<p>class atom.training.DirectClassifier(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. 
Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, a default metric is selected for every task: <ul> <li>\"f1\" for binary classification</li> <li>\"f1_weighted\" for multiclass(-multioutput) classification</li> <li>\"average_precision\" for multilabel classification</li> </ul> <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. 
If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. 
The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWarnings</code> environment. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>SuccessiveHalvingClassifier Train and evaluate the models in a successive halving fashion.</p> <p>TrainSizingClassifier Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/directclassifier/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import DirectClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... 
)\n\n&gt;&gt;&gt; runner = DirectClassifier(models=[\"LR\", \"RF\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nModels: LR, RF\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.992\nTest evaluation --&gt; f1: 0.9767\nTime elapsed: 0.104s\n-------------------------------------------------\nTime: 0.104s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.968\nTime elapsed: 0.204s\n-------------------------------------------------\nTime: 0.204s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.314s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9767 !\nRandomForest       --&gt; f1: 0.968\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n    f1_train  f1_test  time_fit      time\nLR     0.992   0.9767  0.104497  0.104497\nRF     1.000   0.9680  0.204185  0.204185\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n    accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR    0.9708  0.9976  0.9702  0.9767   0.9545  0.9374     0.9813  0.9722  0.9959\nRF    0.9591  0.9490  0.9511  0.9680   0.9381  0.9118     0.9550  0.9815  0.9511\n</code></pre>"}, {"location": "API/training/directclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directclassifier/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. 
X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/directclassifier/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. 
<p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/directclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/directclassifier/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/training/directclassifier/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. 
</li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. 
The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. 
<p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. 
</p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always prefixed with the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always prefixed with the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/directforecaster/", "title": "DirectForecaster", "text": "<p>class atom.training.DirectForecaster(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>mean_absolute_percentage_error</code> is selected. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. 
Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. 
Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p>SuccessiveHalvingForecaster Train and evaluate the models in a successive halving fashion.</p> <p>TrainSizingForecaster Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/directforecaster/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import DirectForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n&gt;&gt;&gt; from sktime.split import temporal_train_test_split\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; train, test = temporal_train_test_split(y, test_size=0.2)\n\n&gt;&gt;&gt; runner = DirectForecaster(models=[\"ES\", \"ETS\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nModels: ES, ETS\nMetric: mape\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0868\nTest evaluation --&gt; mape: -0.2018\nTime elapsed: 0.019s\n-------------------------------------------------\nTime: 0.019s\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0863\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.041s\n-------------------------------------\nExponentialSmoothing --&gt; mape: -0.2018 !\nETS                  --&gt; mape: -0.202\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n     mape_train  mape_test  time_fit      time\nES      -0.0868    -0.2018  0.019017  0.019017\nETS     -0.0863    -0.2020  0.020018  0.020018\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n         mae    mape        mse      r2     rmse\nES  -81.3862 -0.2018 -8661.7730 -0.4189 -93.0686\nETS -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\n</code></pre>"}, {"location": 
"API/training/directforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directforecaster/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/directforecaster/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. 
<p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/directforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/directforecaster/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. 
title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/directforecaster/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
<p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always prefixed with the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always prefixed with the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/directregressor/", "title": "DirectRegressor", "text": "<p>class atom.training.DirectRegressor(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>r2</code> is selected. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. 
Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. 
Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMRegressor Main class for regression tasks.</p> <p>SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.</p> <p>TrainSizingRegressor Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/directregressor/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import DirectRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_digits\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_digits(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = DirectRegressor(models=[\"OLS\", \"RF\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nModels: OLS, RF\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.5991\nTest evaluation --&gt; r2: 0.5765\nTime elapsed: 0.154s\n-------------------------------------------------\nTime: 0.154s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.9803\nTest evaluation --&gt; r2: 0.8803\nTime elapsed: 1.594s\n-------------------------------------------------\nTime: 1.594s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.749s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.5765\nRandomForest         --&gt; r2: 0.8803 !\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n     r2_train  r2_test  time_fit      time\nOLS    0.5991   0.5765  0.153989  0.153989\nRF     0.9803   0.8803  1.594449  1.594449\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n        mae          mape     mse      r2    rmse\nOLS -1.4553 -9.184808e+14 -3.4564  0.5765 -1.8591\nRF  
-0.6098 -2.854782e+14 -0.9773  0.8803 -0.9886\n</code></pre>"}, {"location": "API/training/directregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directregressor/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/directregressor/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). 
metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/directregressor/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/directregressor/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. 
<p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/directregressor/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
<p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/", "title": "SuccessiveHalvingClassifier", "text": "<p>class atom.training.SuccessiveHalvingClassifier(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, a default metric is selected for every task: <ul> <li>\"f1\" for binary classification</li> <li>\"f1_weighted\" for multiclass(-multioutput) classification</li> <li>\"average_precision\" for multilabel classification</li> </ul> <p>skip_runs: int, default=0 Skip last <code>skip_runs</code> runs of the successive halving. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. 
If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). 
<p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of Python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>DirectClassifier Train and evaluate the models in a direct fashion.</p> <p>TrainSizingClassifier Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import SuccessiveHalvingClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = SuccessiveHalvingClassifier([\"LR\", \"RF\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: f1\n\n\nRun: 0 =========================== &gt;&gt;\nModels: LR2, RF2\nSize of training set: 398 (50%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.996\nTest evaluation --&gt; f1: 0.9677\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9444\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.228s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9677 !\nRandomForest       --&gt; f1: 0.9444\n\n\nRun: 1 =========================== &gt;&gt;\nModels: LR1\nSize of training set: 398 (100%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 
0.994\nTest evaluation --&gt; f1: 0.9818\nTime elapsed: 0.095s\n-------------------------------------------------\nTime: 0.095s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.098s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9818\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            f1_train  f1_test  time_fit      time\nfrac model                                       \n0.5  LR2       0.996   0.9677  0.086078  0.086078\n     RF2       1.000   0.9444  0.137125  0.137125\n1.0  LR1       0.994   0.9818  0.094800  0.094800\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR2    0.9591  0.9963  0.9609  0.9677   0.9375  0.9124     0.9813  0.9545  0.9937\nRF2    0.9298  0.9391  0.9308  0.9444   0.8947  0.8504     0.9623  0.9273  0.9308\nLR1    0.9766  0.9972  0.9745  0.9818   0.9643  0.9490     0.9818  0.9818  0.9952\n</code></pre>"}, {"location": "API/training/successivehalvingclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingclassifier/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. 
y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. 
</li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. 
The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. 
<p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. 
</p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/", "title": "SuccessiveHalvingForecaster", "text": "<p>class atom.training.SuccessiveHalvingForecaster(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>mean_absolute_percentage_error</code> is selected. <p>skip_runs: int, default=0 Skip last <code>skip_runs</code> runs of the successive halving. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. 
See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelizations (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. 
Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p>DirectForecaster Train and evaluate the models in a direct fashion.</p> <p>TrainSizingForecaster Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import SuccessiveHalvingForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n&gt;&gt;&gt; from sktime.split import temporal_train_test_split\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; train, test = temporal_train_test_split(y, test_size=0.2)\n\n&gt;&gt;&gt; runner = SuccessiveHalvingForecaster([\"ETS\", \"ES\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: mape\n\n\nRun: 0 =========================== &gt;&gt;\nModels: ETS2, ES2\nSize of training set: 115 (50%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0879\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0879\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.039s\n-------------------------------------\nETS                  --&gt; mape: -0.202 !\nExponentialSmoothing --&gt; mape: -0.202 !\n\n\nRun: 1 =========================== &gt;&gt;\nModels: ETS1\nSize of training set: 115 (100%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0863\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 
0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.021s\n-------------------------------------\nETS --&gt; mape: -0.202\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            mape_train  mape_test  time_fit      time\nfrac model                                           \n0.5  ES2       -0.0879     -0.202  0.017015  0.017015\n     ETS2      -0.0879     -0.202  0.020018  0.020018\n1.0  ETS1      -0.0863     -0.202  0.020018  0.020018\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n          mae   mape        mse      r2     rmse\nETS2 -81.4454 -0.202 -8673.3633 -0.4208 -93.1309\nES2  -81.4483 -0.202 -8673.9309 -0.4209 -93.1339\nETS1 -81.4454 -0.202 -8673.3633 -0.4208 -93.1309\n</code></pre>"}, {"location": "API/training/successivehalvingforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingforecaster/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. 
y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. 
</li> </ul> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. 
</li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. 
The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. 
<p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. 
</p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always prefixed with the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always prefixed with the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/successivehalvingregressor/", "title": "SuccessiveHalvingRegressor", "text": "<p>class atom.training.SuccessiveHalvingRegressor(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>r2</code> is selected. <p>skip_runs: int, default=0 Skip last <code>skip_runs</code> runs of the successive halving. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. 
See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. 
Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMRegressor Main class for regression tasks.</p> <p>DirectRegressor Train and evaluate the models in a direct fashion.</p> <p>TrainSizingRegressor Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/successivehalvingregressor/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import SuccessiveHalvingRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_digits\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_digits(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = SuccessiveHalvingRegressor([\"OLS\", \"RF\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: r2\n\n\nRun: 0 =========================== &gt;&gt;\nModels: OLS2, RF2\nSize of training set: 1257 (50%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6083\nTest evaluation --&gt; r2: -2.168057727555873e+23\nTime elapsed: 0.146s\n-------------------------------------------------\nTime: 0.146s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.9685\nTest evaluation --&gt; r2: 0.7924\nTime elapsed: 0.913s\n-------------------------------------------------\nTime: 0.913s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.061s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: -2.168057727555873e+23 ~\nRandomForest         --&gt; r2: 0.7924 !\n\n\nRun: 1 =========================== &gt;&gt;\nModels: RF1\nSize of training set: 1257 (100%)\nSize of test set: 540\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation 
--&gt; r2: 0.9802\nTest evaluation --&gt; r2: 0.8692\nTime elapsed: 1.571s\n-------------------------------------------------\nTime: 1.571s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.573s\n-------------------------------------\nRandomForest --&gt; r2: 0.8692\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            r2_train       r2_test  time_fit      time\nfrac model                                            \n0.5  OLS2     0.6083 -2.168058e+23  0.146151  0.146151\n     RF2      0.9685  7.924000e-01  0.912829  0.912829\n1.0  RF1      0.9802  8.692000e-01  1.571428  1.571428\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n               mae          mape           mse            r2          rmse\nOLS2 -1.375810e+11 -6.979478e+14 -1.715067e+24 -2.168058e+23 -1.309606e+12\nRF2  -8.656000e-01 -3.503634e+14 -1.642300e+00  7.924000e-01 -1.281500e+00\nRF1  -6.385000e-01 -1.768080e+14 -1.034400e+00  8.692000e-01 -1.017000e+00\n</code></pre>"}, {"location": "API/training/successivehalvingregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingregressor/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. 
y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/successivehalvingregressor/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/successivehalvingregressor/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/successivehalvingregressor/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/training/successivehalvingregressor/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. 
</li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. 
The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. 
<p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. 
</p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always prefixed with the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always prefixed with the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/trainsizingclassifier/", "title": "TrainSizingClassifier", "text": "<p>class atom.training.TrainSizingClassifier(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, a default metric is selected for every task: <ul> <li>\"f1\" for binary classification</li> <li>\"f1_weighted\" for multiclass(-multioutput) classification</li> <li>\"average_precision\" for multilabel classification</li> </ul> <p>train_sizes: int or sequence, default=5 Training set sizes used to run the trainings. 
<ul> <li>If int: Number of equally distributed splits, i.e., for a   value <code>N</code>, it's equal to <code>np.linspace(1.0/N, 1.0, N)</code>.</li> <li>If sequence: Fraction of the training set when &lt;=1, else   total number of samples.</li> </ul> <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. 
If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. 
Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of Python's warning filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMRegressor Main class for regression tasks.</p> <p>DirectRegressor Train and evaluate the models in a direct fashion.</p> <p>SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.</p> <p></p>"}, {"location": "API/training/trainsizingclassifier/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import TrainSizingClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = TrainSizingClassifier(models=\"LR\", verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: f1\n\n\nRun: 0 =========================== &gt;&gt;\nModels: LR02\nSize of training set: 79 (20%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9899\nTest evaluation --&gt; f1: 0.9455\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.089s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9455\n\n\nRun: 1 =========================== &gt;&gt;\nModels: LR04\nSize of training set: 159 (40%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9898\nTest evaluation --&gt; f1: 0.9727\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.088s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9727\n\n\nRun: 2 
=========================== &gt;&gt;\nModels: LR06\nSize of training set: 238 (60%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9936\nTest evaluation --&gt; f1: 0.9683\nTime elapsed: 0.085s\n-------------------------------------------------\nTime: 0.085s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.088s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9683\n\n\nRun: 3 =========================== &gt;&gt;\nModels: LR08\nSize of training set: 318 (80%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9901\nTest evaluation --&gt; f1: 0.9817\nTime elapsed: 0.096s\n-------------------------------------------------\nTime: 0.096s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.099s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9817\n\n\nRun: 4 =========================== &gt;&gt;\nModels: LR10\nSize of training set: 398 (100%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.992\nTest evaluation --&gt; f1: 0.9772\nTime elapsed: 0.099s\n-------------------------------------------------\nTime: 0.099s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.102s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9772\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            f1_train  f1_test  time_fit      time\nfrac model                                       \n0.2  LR02     0.9899   0.9455  0.086078  0.086078\n0.4  LR04     0.9898   0.9727  0.086078  0.086078\n0.6  LR06     0.9936   0.9683  0.085077  0.085077\n0.8  LR08     0.9901   0.9817  0.095865  0.095865\n1.0  LR10     0.9920   0.9772  0.098852  0.098852\n\n\n&gt;&gt;&gt; 
print(runner.evaluate())\n\n      accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR02    0.9298  0.9916  0.9180  0.9455   0.8966  0.8483     0.9286  0.9630  0.9857\nLR04    0.9649  0.9971  0.9557  0.9727   0.9469  0.9248     0.9554  0.9907  0.9950\nLR06    0.9591  0.9976  0.9478  0.9683   0.9386  0.9124     0.9469  0.9907  0.9959\nLR08    0.9766  0.9963  0.9716  0.9817   0.9640  0.9497     0.9727  0.9907  0.9938\nLR10    0.9708  0.9973  0.9636  0.9772   0.9554  0.9372     0.9640  0.9907  0.9954\n</code></pre>"}, {"location": "API/training/trainsizingclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingclassifier/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. 
n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/trainsizingclassifier/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/trainsizingclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. 
Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/trainsizingclassifier/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/trainsizingclassifier/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this 
estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. 
See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. 
</p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. 
Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. 
The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p>"}, {"location": "API/training/trainsizingforecaster/", "title": "TrainSizingForecaster", "text": "<p>class atom.training.TrainSizingForecaster(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>mean_absolute_percentage_error</code> is selected. <p>train_sizes: int or sequence, default=5 Training set sizes used to run the trainings. 
<ul> <li>If int: Number of equally distributed splits, i.e., for a   value <code>N</code>, it's equal to <code>np.linspace(1.0/N, 1.0, N)</code>.</li> <li>If sequence: Fraction of the training set when &lt;=1, else   total number of samples.</li> </ul> <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. 
If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. 
Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of Python's warning filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p>DirectForecaster Train and evaluate the models in a direct fashion.</p> <p>SuccessiveHalvingForecaster Train and evaluate the models in a successive halving fashion.</p> <p></p>"}, {"location": "API/training/trainsizingforecaster/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import TrainSizingForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n&gt;&gt;&gt; from sktime.split import temporal_train_test_split\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; train, test = temporal_train_test_split(y, test_size=0.2)\n\n&gt;&gt;&gt; runner = TrainSizingForecaster([\"ETS\", \"ES\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: mape\n\n\nRun: 0 =========================== &gt;&gt;\nModels: ETS02, ES02\nSize of training set: 23 (20%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0889\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0889\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.041s\n-------------------------------------\nETS                  --&gt; mape: -0.202 !\nExponentialSmoothing --&gt; mape: -0.202 !\n\n\nRun: 1 =========================== &gt;&gt;\nModels: ETS04, ES04\nSize of training set: 46 (40%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0871\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 
0.019s\n-------------------------------------------------\nTime: 0.019s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0871\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.039s\n-------------------------------------\nETS                  --&gt; mape: -0.202 !\nExponentialSmoothing --&gt; mape: -0.202 !\n\n\nRun: 2 =========================== &gt;&gt;\nModels: ETS06, ES06\nSize of training set: 69 (60%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0861\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0867\nTest evaluation --&gt; mape: -0.2016\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.038s\n-------------------------------------\nETS                  --&gt; mape: -0.202\nExponentialSmoothing --&gt; mape: -0.2016 !\n\n\nRun: 3 =========================== &gt;&gt;\nModels: ETS08, ES08\nSize of training set: 92 (80%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0842\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0845\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== &gt;&gt;\nTotal 
time: 0.040s\n-------------------------------------\nETS                  --&gt; mape: -0.202 !\nExponentialSmoothing --&gt; mape: -0.202 !\n\n\nRun: 4 =========================== &gt;&gt;\nModels: ETS10, ES10\nSize of training set: 115 (100%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0863\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0868\nTest evaluation --&gt; mape: -0.2018\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.040s\n-------------------------------------\nETS                  --&gt; mape: -0.202\nExponentialSmoothing --&gt; mape: -0.2018 !\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            mape_train  mape_test  time_fit      time\nfrac model                                           \n0.2  ES02      -0.0889    -0.2020  0.017015  0.017015\n     ETS02     -0.0889    -0.2020  0.021020  0.021020\n0.4  ES04      -0.0871    -0.2020  0.018016  0.018016\n     ETS04     -0.0871    -0.2020  0.019017  0.019017\n0.6  ES06      -0.0867    -0.2016  0.017015  0.017015\n     ETS06     -0.0861    -0.2020  0.020019  0.020019\n0.8  ES08      -0.0845    -0.2020  0.018016  0.018016\n     ETS08     -0.0842    -0.2020  0.020018  0.020018\n1.0  ES10      -0.0868    -0.2018  0.018016  0.018016\n     ETS10     -0.0863    -0.2020  0.020018  0.020018\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n           mae    mape        mse      r2     rmse\nETS02 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES02  -81.4444 -0.2020 -8673.1766 -0.4208 -93.1299\nETS04 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES04  -81.4483 -0.2020 -8673.9309 -0.4209 
-93.1339\nETS06 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES06  -81.3025 -0.2016 -8645.4416 -0.4162 -92.9809\nETS08 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES08  -81.4483 -0.2020 -8673.9309 -0.4209 -93.1339\nETS10 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES10  -81.3862 -0.2018 -8661.7730 -0.4189 -93.0686\n</code></pre>"}, {"location": "API/training/trainsizingforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingforecaster/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). 
</p> <p></p>"}, {"location": "API/training/trainsizingforecaster/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/trainsizingforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. 
</p> <p></p>"}, {"location": "API/training/trainsizingforecaster/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/trainsizingforecaster/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
<p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/trainsizingregressor/", "title": "TrainSizingRegressor", "text": "<p>class atom.training.TrainSizingRegressor(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>r2</code> is selected. <p>train_sizes: int or sequence, default=5 Training set sizes used to run the trainings. <ul> <li>If int: Number of equally distributed splits, i.e., for a   value <code>N</code>, it's equal to <code>np.linspace(1.0/N, 1.0, N)</code>.</li> <li>If sequence: Fraction of the training set when &lt;=1, else   total number of samples.</li> </ul> <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. 
If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. 
Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. 
Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMRegressor Main class for regression tasks.</p> <p>DirectRegressor Train and evaluate the models in a direct fashion.</p> <p>SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.</p> <p></p>"}, {"location": "API/training/trainsizingregressor/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import TrainSizingRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_digits\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_digits(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = TrainSizingRegressor(models=\"OLS\", verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: r2\n\n\nRun: 0 =========================== &gt;&gt;\nModels: OLS02\nSize of training set: 251 (20%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6391\nTest evaluation --&gt; r2: -4.630208907041091e+25\nTime elapsed: 0.148s\n-------------------------------------------------\nTime: 0.148s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.149s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: -4.630208907041091e+25 ~\n\n\nRun: 1 =========================== &gt;&gt;\nModels: OLS04\nSize of training set: 502 (40%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6137\nTest evaluation --&gt; r2: -9.496101715653298e+22\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.151s\n-------------------------------------\nOrdinaryLeastSquares 
--&gt; r2: -9.496101715653298e+22 ~\n\n\nRun: 2 =========================== &gt;&gt;\nModels: OLS06\nSize of training set: 754 (60%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6086\nTest evaluation --&gt; r2: -0.2872\nTime elapsed: 0.151s\n-------------------------------------------------\nTime: 0.151s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.152s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: -0.2872 ~\n\n\nRun: 3 =========================== &gt;&gt;\nModels: OLS08\nSize of training set: 1005 (80%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.5986\nTest evaluation --&gt; r2: 0.5025\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.152s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.5025\n\n\nRun: 4 =========================== &gt;&gt;\nModels: OLS10\nSize of training set: 1257 (100%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.5951\nTest evaluation --&gt; r2: 0.5864\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.151s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.5864\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            r2_train       r2_test  time_fit      time\nfrac model                                            \n0.2  OLS02    0.6391 -4.630209e+25  0.148360  0.148360\n0.4  OLS04    0.6137 -9.496102e+22  0.149996  0.149996\n0.6  OLS06    0.6086 -2.872000e-01  0.151353  0.151353\n0.8  OLS08    0.5986  5.025000e-01  0.149508  
0.149508\n1.0  OLS10    0.5951  5.864000e-01  0.149549  0.149549\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n                mae          mape           mse            r2          rmse\nOLS02 -1.004380e+12 -7.646687e+14 -3.774343e+26 -4.630209e+25 -1.942767e+13\nOLS04 -5.120843e+10 -8.663629e+14 -7.740805e+23 -9.496102e+22 -8.798184e+11\nOLS06 -1.559600e+00 -7.836450e+14 -1.049240e+01 -2.872000e-01 -3.239200e+00\nOLS08 -1.482200e+00 -8.382465e+14 -4.055100e+00  5.025000e-01 -2.013700e+00\nOLS10 -1.445900e+00 -8.224099e+14 -3.371700e+00  5.864000e-01 -1.836200e+00\n</code></pre>"}, {"location": "API/training/trainsizingregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingregressor/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). 
columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/trainsizingregressor/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/trainsizingregressor/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. 
Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/trainsizingregressor/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/trainsizingregressor/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this 
estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. 
See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. 
</p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. 
Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. 
The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p>"}, {"location": "changelog/v4.x.x/", "title": "Release history", "text": ""}, {"location": "changelog/v4.x.x/#version-4141", "title": "Version 4.14.1", "text": "<ul> <li>Fixed an installation issue with <code>conda</code>.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4140", "title": "Version 4.14.0", "text": "<ul> <li>Refactor of the Cleaner and Vectorizer classes.</li> <li>Refactor of the cross_validate method.</li> <li>The plot_pipeline method now supports drawing multiple pipelines.</li> <li>Renamed the <code>Normalizer</code> class to <code>TextNormalizer</code>.</li> <li>Renamed the <code>Gauss</code> class to <code>Normalizer</code>.</li> <li>Added the <code>inverse_transform</code> method to the Scaler, Normalizer   and Cleaner classes.</li> <li>Added the <code>winners</code> property to the trainers (note the extra <code>s</code>). </li> <li>Added the <code>feature_names_in_</code> and <code>n_features_in_</code> attributes to transformers.</li> <li>The default value of the <code>warnings</code> parameter is set to False.</li> <li>Improvements for multicollinearity removal in FeatureSelector.</li> <li>Renamed default feature names to <code>x0</code>, <code>x1</code>, etc... for consistency with   sklearn's API.</li> <li>Renamed component names in FeatureSelector   to <code>pca0</code>, <code>pca1</code>, etc... for consistency with sklearn's API.</li> <li>Significant speed up in pipeline transformations.</li> <li>Fixed a bug where mlflow runs could be ended unexpectedly.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4131", "title": "Version 4.13.1", "text": "<ul> <li>Fixed an installation issue.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4130", "title": "Version 4.13.0", "text": "<ul> <li>Added GPU support. 
Read more in the user guide.</li> <li>Added advanced feature selection strategies.</li> <li>Added the <code>return_sparse</code> parameter to the Vectorizer class.</li> <li>Added the <code>quantile</code> hyperparameter to the Dummy model.</li> <li>The data attributes now return pandas objects where possible.</li> <li>Fixed a bug where the BO could crash after balancing   the data.</li> <li>Fixed a bug where saving the FeatureGenerator   class could fail for certain operators.</li> <li>Fixed a bug where the FeatureSelector   class displayed the wrong output.</li> <li>Fixed a bug where the <code>mapping</code> attribute was not reordered.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4120", "title": "Version 4.12.0", "text": "<ul> <li>Support for Python 3.10.</li> <li>New Discretizer class to bin numerical features.</li> <li>Refactor of the FeatureGenerator class.</li> <li>The <code>mapping</code> attribute now shows all encoded features.</li> <li>Added the <code>sample_weight</code> parameter to the evaluate method.</li> <li>ATOMClassifier has now a <code>stratify</code> parameter   to split the data sets in a stratified fashion.</li> <li>Possibility to exclude hyperparameters from the BO adding <code>!</code> before the name.</li> <li>Added memory usage to the stats method.</li> <li>Fixed a bug where plot_shap_decision could fail when only one row was plotted.</li> <li>Added versioning to the documentation.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4110", "title": "Version 4.11.0", "text": "<ul> <li>Full support for sparse matrices. 
Read more in the user guide.</li> <li>The shrink method now also handles sparse features.</li> <li>Refactor of the distribution method.</li> <li>Added three new linear models: Lars, Huber and Perc.</li> <li>Dimensions can be shared across models using the key 'all' in <code>ht_params[\"dimensions\"]</code>.</li> <li>Assign hyperparameters to tune using the predefined dimensions.</li> <li>It's now possible to tune a custom number of layers for the MLP   model.</li> <li>If multiple BO calls share the best score, the one with the shortest   training time is selected as winner (instead of the first).</li> <li>Fixed a bug where the BO could fail when custom dimensions were defined.</li> <li>Fixed a bug where FeatureSelector   could fail after repeated calls to fit.</li> <li>Fixed a bug where FeatureGenerator   didn't pass the correct data indices to its output.</li> <li>Performance improvements for the custom pipeline.</li> <li>Minor documentation fixes.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4100", "title": "Version 4.10.0", "text": "<ul> <li>Added the <code>holdout</code> data set to have an extra way of assessing a   model's performance on a completely independent dataset. Read more   in the user_guide.</li> <li>Complete rework of the ensemble models.</li> <li>Support for dataframe indexing. 
Read more in the user guide.</li> <li>New plot_parshap plot to detect overfitting   features.</li> <li>The new create_dashboard method makes analyzing   the models even easier using a dashboard app.</li> <li>The plot_feature_importance   plot now also accepts estimators with coefficients.</li> <li>Added the transform method for models.</li> <li>Added the <code>threshold</code> parameter to the evaluate method.</li> <li>The <code>reset_predictions</code> method is deprecated in favour of the new   clear method.</li> <li>Refactor of the model's full_train method.</li> <li>The merge method is available for all trainers.</li> <li>Improvements in the trainer's pipeline.</li> <li>Training scores are now also saved to the mlflow run.</li> <li>Trying to change the data in a branch after fitting a model with it now   raises an exception.</li> <li>Fixed a bug where the columns of array inputs were not ordered correctly.</li> <li>Fixed a bug where branches did not correctly act case-insensitive.</li> <li>Fixed a bug where the export_pipeline   method for models would not export the transformers in the correct branch.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-491", "title": "Version 4.9.1", "text": "<ul> <li>Changed the default cross-validation for hyperparameter tuning   from 5 to 1 to avoid errors with deep learning models.</li> <li>Added clearer exception messages when a model's run failed.</li> <li>Fixed a bug where custom dimensions didn't show during   hyperparameter tuning.</li> <li>Documentation improvements.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-490", "title": "Version 4.9.0", "text": "<ul> <li>Drop support of Python 3.6.</li> <li>Added the HistGBM model.</li> <li>Improved print layout for hyperparameter tuning.</li> <li>The new available_models method returns an overview of   the available predefined models.</li> <li>The calibrate and cross_validate   methods can no longer be accessed from the trainers.</li> <li>The <code>pipeline</code> 
parameter for the prediction methods is deprecated.</li> <li>Improved visualization of the plot_rfecv, plot_successive_halving and    plot_learning_curve methods.</li> <li>Sparse matrices are now accepted as input.</li> <li>Duplicate BO calls are no longer calculated.</li> <li>Improvement in performance of the RNN model.</li> <li>Refactor of the model's <code>bo</code> attribute.</li> <li>Predefined hyperparameters have been updated to be consistent with sklearn's API.</li> <li>Fixed a bug where custom scalers were ignored by the models.</li> <li>Fixed a bug where the BO of certain models would crash with custom hyperparameters.</li> <li>Fixed a bug where duplicate column names could be generated from a custom transformer.</li> <li>Documentation improvements.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-480", "title": "Version 4.8.0", "text": "<ul> <li>The Encoder class now directly handles   unknown categories encountered during fitting.</li> <li>The Balancer and Encoder classes now accept custom estimators for   the <code>strategy</code> parameter.</li> <li>The new merge method enables the user to merge   multiple atom instances into one.</li> <li>The dtype shrinking is moved from atom's initializers to the   shrink method.</li> <li>ATOM's custom pipeline now handles transformers fitted on a   subset of the dataset.</li> <li>The <code>column</code> parameter in the distribution   method is renamed to <code>columns</code> for continuity of the API.</li> <li>The <code>mae</code> criterion for the GBM model hyperparameter tuning is deprecated   to be consistent with sklearn's API.</li> <li>Branches are now case-insensitive.</li> <li>Renaming a branch using an existing name now raises an exception.</li> <li>Fixed a bug where columns of type <code>category</code> broke the Imputer class.</li> <li>Fixed a bug where predictions of the Stacking ensemble crashed for   branches with multiple transformers.</li> <li>The tables in the documentation now adapt to dark 
mode.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-473", "title": "Version 4.7.3", "text": "<ul> <li>Fixed a bug where the conda-forge recipe couldn't install properly.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-472", "title": "Version 4.7.2", "text": "<ul> <li>Fixed a bug where the pipeline failed for custom transformers that   returned sparse matrices.</li> <li>Package requirements files are added to the installer.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-471", "title": "Version 4.7.1", "text": "<ul> <li>Fixed a bug where the pip installer failed.</li> <li>Fixed a bug where categorical columns also selected datetime columns.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-470", "title": "Version 4.7.0", "text": "<ul> <li>Launched our new slack channel!</li> <li>The new FeatureExtractor class extracts useful features from datetime columns.</li> <li>The new plot_det method plots a binary classifier's detection error tradeoff curve. </li> <li>The plot_partial_dependence is able to draw Individual Conditional Expectation (ICE) lines.</li> <li>The full traceback of exceptions encountered during training are now   saved to the logger.</li> <li>ATOMClassifier and ATOMRegressor now convert the dtypes of the input   data to the minimal allowed type for memory efficiency.</li> <li>The scoring method is renamed to evaluate to clarify its purpose.</li> <li>The <code>column</code> parameter in the apply method   is renamed to <code>columns</code> for continuity of the API.</li> <li>Minor documentation improvements.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-460", "title": "Version 4.6.0", "text": "<ul> <li>Added the full_train method to retrieve an estimator trained   on the complete dataset.</li> <li>The score method is now also able to calculate custom metrics on new data.</li> <li>Refactor of the Imputer class. 
</li> <li>Refactor of the Encoder class to avoid errors for unknown classes and allow   the input of missing values.</li> <li>The clean method no longer automatically   encodes the target column for regression tasks.</li> <li>Creating a branch using a models' acronym as name now raises an exception.</li> <li>Fixed a bug where CatBoost failed when <code>early_stopping</code> &lt; 1.</li> <li>Fixed a bug where created pipelines had duplicated names.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-450", "title": "Version 4.5.0", "text": "<ul> <li>Support of NLP pipelines. Read more in the user guide.</li> <li>Integration of mlflow to track all models in the   pipeline. Read more in the user guide.</li> <li>The new Normalizer class transforms features to a more Gaussian-like distribution.</li> <li>New cross_validate method to evaluate the robustness   of a pipeline using cross_validation.</li> <li>New reset method to go back to atom's initial state.</li> <li>Added the Dummy model to compare other models with a simple baseline.</li> <li>New plot_wordcloud and plot_ngrams methods for text visualization.</li> <li>Plots now can return the figure object when <code>display=None</code>.</li> <li>The Pruner class is now able to drop outliers   based on the selection of multiple strategies.</li> <li>The new <code>shuffle</code> parameter in atom's initializer determines whether to   shuffle the dataset.</li> <li>The trainers no longer require you to specify a model using the <code>models</code>   parameter. 
If left to default, all predefined models for that task are used.</li> <li>The apply method now accepts args and kwargs for the function.</li> <li>Refactor of the evaluate method.</li> <li>Refactor of the export_pipeline method.</li> <li>The parameters in the Cleaner class have been refactored to better describe   their function.</li> <li>The <code>train_sizes</code> parameter in train_sizing now accepts integer   values to automatically create equally distributed splits in the training set.</li> <li>Refactor of plot_pipeline to show models in the diagram as well.</li> <li>Refactor of the <code>bagging</code> parameter to the (more appropriate) name <code>n_bootstrap</code>.</li> <li>New option to exclude columns from a transformer adding <code>!</code> before their name.</li> <li>Fixed a bug where the Pruner class failed if there were categorical columns   in the dataset.</li> <li>Completely reworked documentation website.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-440", "title": "Version 4.4.0", "text": "<ul> <li>New apply method to perform data transformations   as a function to the pipeline.</li> <li>Added the status method to save an overview of   atom's branches and models to the logger.</li> <li>Improved the output messages for the Imputer class.</li> <li>The dataset's columns can now be called directly from atom.</li> <li>The distribution and plot_distribution   methods now ignore missing values.</li> <li>Fixed a bug where transformations could fail when columns were added to the   dataset after initializing the pipeline.</li> <li>Fixed a bug where the Cleaner class didn't drop   columns consisting entirely of missing values when <code>drop_min_cardinality=True</code>.</li> <li>Fixed a bug where the winning model wasn't displayed correctly.</li> <li>Refactored the way transformers are added or removed from predicting methods.</li> <li>Improved documentation.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-430", "title": "Version 4.3.0", 
"text": "<ul> <li>Possibility to add custom transformers to the pipeline.</li> <li>The export_pipeline utility method exports   atom's current pipeline to a sklearn object.</li> <li>New magic methods make atom behave similarly to sklearn's Pipeline.</li> <li>All training approaches can now be combined in the same atom instance.</li> <li>New plot_relationships, plot_distribution and plot_qq plots for data inspection.</li> <li>Complete rework of all the shap plots to be consistent with their new API.</li> <li>Improvements for the Scaler and Pruner classes.</li> <li>The acronym for custom models now defaults to the capital letters in the class' __name__.</li> <li>Possibility to apply transformations on only a subset of the columns.</li> <li>Plots and methods now accept <code>winner</code> as model name.</li> <li>Fixed a bug where custom metrics didn't show the correct name.</li> <li>Fixed a bug where timers were not displayed correctly.</li> <li>Further compatibility with deep learning datasets.</li> <li>Large refactoring for performance optimization.</li> <li>Cleaner output of messages to the logger.</li> <li>Plots no longer show a default title.</li> <li>Minor bug fixes.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-421", "title": "Version 4.2.1", "text": "<ul> <li>Bug fix where there was memory leakage in successive halving   and train sizing pipelines.</li> <li>Improved documentation.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-420", "title": "Version 4.2.0", "text": "<ul> <li>Possibility to add custom models to the pipeline using ATOMModel.</li> <li>Compatibility with deep learning models.</li> <li>New branch system for different data pipelines. 
Read more in the user guide.</li> <li>Use the canvas contextmanager to draw multiple plots in one figure.</li> <li>New voting and stacking ensemble techniques.</li> <li>New get_class_weight utility method.</li> <li>New Sequential Feature Selection strategy for the FeatureSelector.</li> <li>Added the <code>sample_weight</code> parameter to the score method.</li> <li>New ways to initialize the data in the <code>training</code> instances.</li> <li>The <code>test_size</code> parameter now also allows integer values.</li> <li>Renamed categories to classes to be consistent with sklearn's API.</li> <li>The class property now returns a pd.DataFrame of the number of rows per target class   in the train, test and complete dataset.</li> <li>Possibility to add custom parameters to an estimator's fit method through <code>est_params</code>.</li> <li>The successive halving and train sizing approaches now both allow subsequent   runs from atom without losing the information from previous runs.</li> <li>Bug fix where ATOMLoader wouldn't encode the target column during transformation.</li> <li>Added the Deep learning, Ensembles   and Utilities example notebooks.</li> <li>Support for python 3.9.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-410", "title": "Version 4.1.0", "text": "<ul> <li>New <code>est_params</code> parameter to customize the parameters in every model's estimator.</li> <li>Following skopt's API, the <code>n_random_starts</code> parameter to specify the number   of random trials is deprecated in favour of <code>n_initial_points</code>.</li> <li>The Balancer class now allows you to use any of the   strategies from imblearn.</li> <li>New utility attributes to inspect the dataset.</li> <li>Four new models: CatNB, CNB, ARD and RNN.</li> <li>Added the models section to the documentation.</li> <li>Small changes in log outputs.</li> <li>Bug fixes and performance improvements.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-401", "title": "Version 4.0.1", 
"text": "<ul> <li>Bug fix where the FeatureGenerator was not deterministic for a fixed random state.</li> <li>Bug fix where subsequent runs with the same metric failed.</li> <li>Added the license file to the package's installer.</li> <li>Typo fixes in documentation.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-400", "title": "Version 4.0.0", "text": "<ul> <li>Bayesian optimization package changed from GpyOpt   to skopt.</li> <li>Complete revision of the model's hyperparameters.</li> <li>Four SHAP plots can now be called directly from an ATOM pipeline.</li> <li>Two new plots for regression tasks.</li> <li>New plot_pipeline and <code>pipeline</code> attribute to access all transformers. </li> <li>Possibility to determine transformer parameters per method.</li> <li>New calibrate and plot_calibration methods.</li> <li>Metrics can now be added as scorers or functions with signature metric(y, y_pred, **kwargs).</li> <li>Implementation of multi-metric runs.</li> <li>Possibility to choose which metric to plot.</li> <li>Early stopping for models that allow in-training validation.</li> <li>Added the <code>ATOMLoader</code> function to load any saved pickle instance.</li> <li>The \"remove\" strategy in the data cleaning parameters is deprecated in favour of \"drop\".</li> <li>Implemented the dfs strategy in FeatureGenerator.</li> <li>All training classes now inherit from BaseEstimator.</li> <li>Added multiple new example notebooks.</li> <li>Tests coverage up to 100%.</li> <li>Completely new documentation page.</li> <li>Bug fixes and performance improvements.</li> </ul>"}, {"location": "changelog/v5.x.x/", "title": "Release history", "text": ""}, {"location": "changelog/v5.x.x/#version-600", "title": "Version 6.0.0", "text": "<p> New features</p> <ul> <li>Completely new module for time series. Read more in the user guide.</li> <li>Support for Python 3.11 and drop support for Python 3.8   and Python 3.9.</li> <li>New data engines. 
Read more in the user guide.</li> <li>Improved memory optimizations. Read more in the user guide.</li> <li>Added the <code>iterative</code> strategy for numerical imputation.</li> <li>New update_traces method to further customize your plots.</li> </ul> <p> API changes</p> <ul> <li>The FeatureGrouper class no longer accepts a <code>name</code> parameter. Provide   the group names directly through the <code>group</code> parameter as dict.</li> <li>Rework of the register method.</li> <li>The <code>multioutput</code> attribute is deprecated. Multioutput meta-estimators are   now assigned automatically.</li> <li>Model tags have to be separated from the acronym by an underscore.</li> <li>The <code>engine</code> parameter is now a dict.</li> <li>The <code>automl</code> method is deprecated.</li> </ul> <p> Enhancements</p> <ul> <li>Transformations only on <code>y</code> are now accepted, e.g., <code>atom.scale(columns=-1)</code>.</li> <li>Full support for pandas nullable dtypes.</li> <li>The dataset can now be provided as callable.</li> <li>The save and save_data   methods now accept pathlib.Path objects as <code>filename</code>.</li> <li>Cleaner representation on hover for the plot_timeline method.</li> <li>Added the <code>hdbscan</code> strategy to the Pruner class.</li> <li>The <code>cv</code> key in <code>ht_params</code> now accepts a custom cross-validation generator.</li> <li>Improved error message for incorrect stratification of multioutput datasets.</li> <li>Rework of the shrink method.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed a bug where the cross_validate method could   fail for pipelines that changed the number of rows.</li> <li>Fixed a bug where the Pruner class didn't drop all outlier clusters.</li> <li>Fixed a bug where the pipeline could fail for transformers that returned a   series.</li> <li>Fixed a bug where the pipeline could fail for transformers that reset its   internal attributes during fitting.</li> <li>Fixed a bug where the register method 
failed in Databricks.</li> <li>Fixed a bug where tuning hyperparameter for a <code>base_estimator</code> inside a custom   meta-estimator would fail.</li> <li>Fixed a bug where the data properties' <code>@setter</code> could fail for numpy arrays.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-520", "title": "Version 5.2.0", "text": "<p> New features</p> <ul> <li>Two new plot methods: plot_terminator_improvement and plot_timeline.</li> </ul> <p> Enhancements</p> <ul> <li>Data splits in every trial are now properly stratified according to the   selected strategy.</li> <li>Performance optimization for multiple methods using smart caching.</li> <li>Improved visualizations for plots with logarithmic hyperparameters.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed a bug where parameters in a trial would not match with those displayed.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-512", "title": "Version 5.1.2", "text": "<p> API changes</p> <ul> <li>The default <code>strategy</code> for the <code>encode</code> method has   changed from \"LeaveOneOut\" to \"Target\"-encoding. 
LeaveOneOut is no longer a   supported strategy.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed a bug where stratification failed for datasets where the target column was   not placed last.</li> <li>Fixed a bug where transformers with no <code>get_feature_names_out</code> method could fail.</li> <li>Fixed a bug where the FeatureSelector class could fail when transforming a   dataset with different column order than seen at fit time.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-511", "title": "Version 5.1.1", "text": "<p> API changes</p> <ul> <li>The <code>infrequent_to_value</code> parameter in the Encoder class is replaced with   <code>infrequent_to_value</code> to be consistent with sklearn's naming convention.</li> </ul> <p> Enhancements</p> <ul> <li>Added the <code>kwargs</code> parameter to the save_data method.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed an installation issue for systems without an x86 architecture.</li> <li>Fixed a bug where Voting would fail for certain metrics.</li> <li>Fixed a bug where the time metric in mlflow was always zero.</li> <li>Fixed a bug where shap plots wouldn't display the full column names.</li> <li>Fixed a bug where column names were not properly propagated during   transformation.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-510", "title": "Version 5.1.0", "text": "<p> New features</p> <ul> <li>Support for multilabel classification, multiclass-multilabel classification   and multioutput regression tasks. Read more in the user guide.</li> <li>New backend parameter to choose a parallel execution   backend.</li> <li>New <code>parallel</code> parameter to train multiple models   simultaneously.</li> <li>Integration with DAGsHub to store your mlflow experiments.   
Read more in the user guide.</li> <li>New serve method to deploy models to a rest API endpoint.</li> <li>New get_best_threshold method to calculate the   optimal threshold for binary and multilabel tasks.</li> <li>New get_sample_weight method to calculate   the sample weights for a balanced data set.</li> </ul> <p> API changes</p> <ul> <li>The <code>ATOMLoader</code> class is deprecated in favor of the load method.</li> <li>The <code>errors</code> attribute for runners is deprecated.</li> </ul> <p> Enhancements</p> <ul> <li>Added three new notebook examples.</li> <li>Added the <code>drop_chars</code> parameter to the Cleaner class.</li> <li>Added the <code>errors</code> parameter to the trainers.</li> <li>Rework of the dependencies, making the base package more lightweight.</li> <li>The logging entries for external libraries are redirected to atom's   file handler.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed multiple errors that appeared after sklearn's 1.2 update.</li> <li>Fixed a bug where hyperparameter tuning could fail for multi-metric runs.</li> <li>Fixed a bug where trials would try to report multiple times the same step.</li> <li>Fixed a bug where custom models could skip in-training validation.</li> <li>Fixed an issue where the bootstrapping estimators were trained using   <code>partial_fit</code>.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-501", "title": "Version 5.0.1", "text": "<p> Bug fixes</p> <ul> <li>Fixed installation issue.</li> <li>Updated package dependencies.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-500", "title": "Version 5.0.0", "text": "<p> New features</p> <ul> <li>Completely new hyperparameter tuning process.</li> <li>Completely reworked plotting interface.</li> <li>Accelerate your pipelines with sklearnex.</li> <li>New FeatureGrouper class to extract statistical features from   similar groups.</li> <li>New create_app method to create a nice front-end   for model predictions.</li> <li>New 
inverse_transform method for   atom and models.</li> <li>New linear model: OrthogonalMatchingPursuit.</li> <li>The plot_results method now accepts time metrics.</li> </ul> <p> API changes</p> <ul> <li>The <code>gpu</code> parameter is deprecated in favor of <code>device</code>   and <code>engine</code>.</li> <li>Refactor of the Cleaner, Discretizer, Encoder and FeatureSelector   classes.</li> <li>Refactor of all shap plots.</li> <li>Refactor of the apply method.</li> <li>The <code>plot_scatter_matrix</code> method is renamed to plot_relationships.</li> <li>The <code>kSVM</code> model is renamed to SVM.</li> <li>Multidimensional datasets are no longer supported. Check the deep learning   section of the user guide for guidance with such datasets.</li> <li>The <code>greater_is_better</code>, <code>needs_proba</code> and <code>needs_threshold</code> parameters are   deprecated. Metric functions are now created using make_scorer's   default parameters.</li> <li>The <code>drop</code> method is removed from atom. 
Use the reworked apply   method instead.</li> <li>The prediction methods can no longer be called from atom.</li> <li>The dashboard method for models is now called create_dashboard.</li> </ul> <p> Enhancements</p> <ul> <li>New examples for plotting, automated feature scaling,   pruning and advanced hyperparameter tuning.</li> <li>The Normalizer class can now be accelerated with GPU.</li> <li>The Scaler class now ignores binary columns (only 0s and 1s).</li> <li>The <code>models</code> parameter in plot and utility methods now accepts model indices.</li> <li>The transform method now also transforms   only <code>y</code> when <code>X</code> has a default value.</li> <li>The prediction methods now return pandas objects.</li> <li>Dependency versions are checked with originals after unpickling.</li> <li>Automatic generation of documentation from docstrings.</li> <li>Improvements in documentation display for mobile phones.</li> <li>New <code>feature_importance</code> attribute for models.</li> <li>Added a visualization for automated feature scaling to plot_pipeline.</li> </ul> <p> Bug fixes</p> <ul> <li>The FeatureExtractor class no longer raises a warning for highly   fragmented dataframes.</li> <li>Fixed a bug where models could not call the score function.</li> <li>The Encoder class no longer fails when the user provides ordinal   values that are not present during fitting.</li> <li>Fixed a bug with the <code>max_nan_rows</code> parameter in the Imputer class.</li> <li>Fixed a bug where Tokenizer could fail when no ngrams were found.</li> </ul>"}, {"location": "examples/accelerating_cuml/", "title": "Accelerating cuml", "text": "In\u00a0[1]: Copied! 
<pre>from atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n\n# Create a dummy dataset\nX, y = make_classification(n_samples=100000, n_features=40)\n</pre> from atom import ATOMClassifier from sklearn.datasets import make_classification  # Create a dummy dataset X, y = make_classification(n_samples=100000, n_features=40) In\u00a0[2]: Copied! <pre>atom = ATOMClassifier(X, y, device=\"gpu\", engine=\"cuml\", verbose=2)\n</pre> atom = ATOMClassifier(X, y, device=\"gpu\", engine=\"cuml\", verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\nAlgorithm task: binary classification.\nGPU training enabled.\nBackend engine: cuml.\n\nDataset stats ==================== &gt;&gt;\nShape: (100000, 41)\nMemory: 32.80 MB\nScaled: True\nOutlier values: 8127 (0.2%)\n-------------------------------------\nTrain set size: 80000\nTest set size: 20000\n-------------------------------------\n|   |       dataset |         train |          test |\n| - | ------------- | ------------- | ------------- |\n| 0 |   50006 (1.0) |   40005 (1.0) |   10001 (1.0) |\n| 1 |   49994 (1.0) |   39995 (1.0) |    9999 (1.0) |\n\n</pre> In\u00a0[3]: Copied! <pre>atom.scale()\n</pre> atom.scale() <pre>Fitting Scaler...\nScaling features...\n</pre> In\u00a0[13]: Copied! <pre>atom.dataset\n</pre> atom.dataset Out[13]: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x31 x32 x33 x34 x35 x36 x37 x38 x39 target 0 2.021646 -0.634557 -0.867811 1.103642 1.559011 0.122284 -0.864821 1.411657 0.147997 -2.269082 ... -0.489864 1.861048 -0.353861 0.720823 -1.522117 -0.737707 -1.573936 -0.832174 0.203154 0 1 -0.019885 0.846568 -0.364059 -1.091604 -1.336692 0.186689 -0.274142 0.020563 0.693235 -1.908658 ... -1.610058 -0.365231 0.284908 0.170156 -0.236553 -0.573761 -0.107317 -2.480178 0.420341 0 2 0.516618 -0.013420 -0.753879 -0.488243 0.560051 0.395817 -0.522523 -1.083503 -0.073398 0.383061 ... 
0.966283 1.405546 -0.658654 0.339090 -1.615997 -1.312444 0.984578 0.602858 -1.110684 1 3 0.111861 -0.966334 0.208509 0.494328 -0.766835 -0.003399 -0.500449 -0.530622 -0.481663 -1.146132 ... -0.304896 2.030211 -1.189488 -1.238600 1.658765 -0.255644 0.572194 0.195496 0.617734 1 4 0.160135 -0.873517 0.719142 -2.020767 0.421435 -1.941230 0.835615 -1.178845 0.235273 -0.328574 ... 1.633662 -0.631118 1.814046 1.031754 0.328665 1.704483 2.153710 -1.430552 -0.543915 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 99995 1.100240 0.092581 -0.346265 0.234024 0.590199 0.755019 -1.688456 -1.031070 -0.620193 -0.283336 ... 0.356480 1.346821 -0.299087 2.343587 -2.003646 -0.933179 0.764255 -0.233526 -1.462311 1 99996 -1.142596 0.321843 -0.974006 0.390418 0.404722 -0.324256 -0.288176 1.009458 0.860912 -0.191313 ... 0.044618 -2.030135 1.448640 -0.854798 1.441451 1.347461 -0.937607 0.572504 -0.787673 0 99997 1.658252 0.303637 -0.020324 0.225917 0.154092 -1.208507 -0.199919 1.063016 -0.395696 -0.060886 ... 1.563345 -1.261853 -0.810122 -0.503823 1.565602 -1.264792 -0.591644 1.588397 0.601721 0 99998 -0.288042 -1.139792 1.548338 0.501413 0.361604 -0.315720 -0.564607 1.500870 0.501768 0.649079 ... 0.344663 1.734476 0.660177 0.767554 1.461940 0.310189 -1.469978 0.900132 1.114330 0 99999 -3.093351 -0.636463 -0.449575 1.169980 -1.041870 -0.257173 2.072777 -0.101111 -0.956916 -0.251162 ... 2.250647 0.746250 -0.610311 0.445467 -0.636288 -0.187444 0.226108 -0.186927 -1.024960 1 <p>100000 rows \u00d7 41 columns</p> In\u00a0[4]: Copied! <pre>print(f\"Scaler used: {atom.standard}\")\nprint(f\"Scaler's module: {atom.standard.__class__.__module__}\")\n</pre> print(f\"Scaler used: {atom.standard}\") print(f\"Scaler's module: {atom.standard.__class__.__module__}\") <pre>Scaler used: StandardScaler()\nScaler's module: cuml._thirdparty.sklearn.preprocessing._data\n</pre> In\u00a0[5]: Copied! 
<pre>atom.run(models=[\"RF\", \"SGD\", \"XGB\"])\n</pre> atom.run(models=[\"RF\", \"SGD\", \"XGB\"]) <pre>\nTraining ========================= &gt;&gt;\nModels: RF, SGD, XGB\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9726\nTest evaluation --&gt; f1: 0.9431\nTime elapsed: 1.935s\n-------------------------------------------------\nTotal time: 1.935s\n\n\nResults for StochasticGradientDescent:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9236\nTest evaluation --&gt; f1: 0.9219\nTime elapsed: 02m:16s\n-------------------------------------------------\nTotal time: 02m:16s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9749\nTest evaluation --&gt; f1: 0.9437\nTime elapsed: 6.394s\n-------------------------------------------------\nTotal time: 6.394s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 02m:24s\n-------------------------------------\nRandomForest              --&gt; f1: 0.9431\nStochasticGradientDescent --&gt; f1: 0.9219\nXGBoost                   --&gt; f1: 0.9437 !\n</pre> In\u00a0[6]: Copied! <pre>atom.results\n</pre> atom.results Out[6]: score_train score_test time_fit time RF 0.9726 0.9431 1.934512 1.934512 SGD 0.9236 0.9219 135.871493 135.871493 XGB 0.9749 0.9437 6.394416 6.394416 In\u00a0[7]: Copied! <pre>for m in atom.models:\n    print(f\"{m}'s module: {atom[m].estimator.__class__.__module__}\")\n</pre> for m in atom.models:     print(f\"{m}'s module: {atom[m].estimator.__class__.__module__}\") <pre>RF's module: cuml.ensemble.randomforestclassifier\nSGD's module: sklearn.linear_model._stochastic_gradient\nXGB's module: xgboost.sklearn\n</pre> In\u00a0[8]: Copied! 
<pre>atom.evaluate()\n</pre> atom.evaluate() Out[8]: accuracy average_precision balanced_accuracy f1 jaccard matthews_corrcoef precision recall roc_auc RF 0.9429 0.9741 0.9429 0.9431 0.8924 0.8858 0.9391 0.9472 0.9792 SGD 0.9217 0.9635 0.9218 0.9219 0.8551 0.8435 0.9203 0.9235 0.9676 XGB 0.9434 0.9753 0.9434 0.9437 0.8933 0.8868 0.9385 0.9489 0.9798"}, {"location": "examples/accelerating_cuml/#example-accelerating-pipelines-on-gpu", "title": "Example: Accelerating pipelines on GPU\u00b6", "text": "<p>This example shows how to accelerate a pipeline on GPU using cuML.</p> <p>The data used is a synthetic dataset created using sklearn's make_classification function.</p>"}, {"location": "examples/accelerating_sklearnex/", "title": "Accelerating sklearnex", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! 
<pre>atom = ATOMClassifier(X, \"RainTomorrow\", verbose=2)\n</pre> atom = ATOMClassifier(X, \"RainTomorrow\", verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\n</pre> In\u00a0[4]: Copied! <pre># Impute missing values and encode categorical columns\natom.impute()\natom.encode()\n</pre> # Impute missing values and encode categorical columns atom.impute() atom.encode() <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Dropping 637 samples due to missing values in feature MinTemp.\n --&gt; Dropping 322 samples due to missing values in feature MaxTemp.\n --&gt; Dropping 1406 samples due to missing values in feature Rainfall.\n --&gt; Dropping 60843 samples due to missing values in feature Evaporation.\n --&gt; Dropping 67816 samples due to missing values in feature Sunshine.\n --&gt; Dropping 9330 samples due to missing values in feature WindGustDir.\n --&gt; Dropping 9270 samples due to missing values in feature WindGustSpeed.\n --&gt; Dropping 10013 samples due to missing values in feature WindDir9am.\n --&gt; Dropping 3778 samples due to missing values in feature WindDir3pm.\n --&gt; Dropping 1348 samples due to missing values in feature WindSpeed9am.\n --&gt; Dropping 2630 samples due to missing values in feature WindSpeed3pm.\n --&gt; Dropping 1774 samples due to missing values in feature Humidity9am.\n --&gt; Dropping 3610 samples due to missing values in feature Humidity3pm.\n --&gt; Dropping 14014 samples due to missing values in feature Pressure9am.\n --&gt; Dropping 13981 samples due to missing values in feature Pressure3pm.\n --&gt; Dropping 53657 samples due to 
missing values in feature Cloud9am.\n --&gt; Dropping 57094 samples due to missing values in feature Cloud3pm.\n --&gt; Dropping 904 samples due to missing values in feature Temp9am.\n --&gt; Dropping 2726 samples due to missing values in feature Temp3pm.\n --&gt; Dropping 1406 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 26 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[5]: Copied! <pre># Train a K-Nearest Neighbors model (using default sklearn)\natom.run(models=\"KNN\", metric=\"f1\")\n</pre> # Train a K-Nearest Neighbors model (using default sklearn) atom.run(models=\"KNN\", metric=\"f1\") <pre>\nTraining ========================= &gt;&gt;\nModels: KNN\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.7135\nTest evaluation --&gt; f1: 0.5904\nTime elapsed: 4.239s\n-------------------------------------------------\nTime: 4.239s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 8.264s\n-------------------------------------\nKNearestNeighbors --&gt; f1: 0.5904\n</pre> In\u00a0[7]: Copied! <pre># Now, we train an accelerated KNN using engine=\"sklearnex\"\n# Note the difference in training speed!!\natom.run(models=\"KNN_acc\", metric=\"f1\", engine={\"estimator\": \"sklearnex\"})\n</pre> # Now, we train an accelerated KNN using engine=\"sklearnex\" # Note the difference in training speed!! 
atom.run(models=\"KNN_acc\", metric=\"f1\", engine={\"estimator\": \"sklearnex\"}) <pre>\nTraining ========================= &gt;&gt;\nModels: KNN_acc\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.7135\nTest evaluation --&gt; f1: 0.5904\nTime elapsed: 1.185s\n-------------------------------------------------\nTime: 1.185s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.226s\n-------------------------------------\nKNearestNeighbors --&gt; f1: 0.5904\n</pre> In\u00a0[8]: Copied! <pre>atom.results\n</pre> atom.results Out[8]: f1_train f1_test time_fit time KNN 0.7135 0.5904 4.238729 4.238729 KNN_acc 0.7135 0.5904 1.184578 1.184578 In\u00a0[9]: Copied! <pre># Note how the underlying estimators might look the same...\nprint(atom.knn.estimator)\nprint(atom.knn_acc.estimator)\n\n# ... but are using different implementations\nprint(atom.knn.estimator.__module__)\nprint(atom.knn_acc.estimator.__module__)\n</pre> # Note how the underlying estimators might look the same... print(atom.knn.estimator) print(atom.knn_acc.estimator)  # ... but are using different implementations print(atom.knn.estimator.__module__) print(atom.knn_acc.estimator.__module__) <pre>KNeighborsClassifier(n_jobs=1)\nKNeighborsClassifier(n_jobs=1)\nsklearn.neighbors._classification\nsklearnex.neighbors.knn_classification\n</pre> In\u00a0[10]: Copied! 
<pre>with atom.canvas(1, 2, title=\"Timing engines: sklearn vs sklearnex\"):\n    atom.plot_results(metric=\"time_fit\", title=\"Training\")\n    atom.plot_results(metric=\"time\", title=\"Total\")\n</pre> with atom.canvas(1, 2, title=\"Timing engines: sklearn vs sklearnex\"):     atom.plot_results(metric=\"time_fit\", title=\"Training\")     atom.plot_results(metric=\"time\", title=\"Total\")"}, {"location": "examples/accelerating_sklearnex/#example-accelerating-pipelines", "title": "Example: Accelerating pipelines\u00b6", "text": "<p>This example shows how to accelerate your models on CPU using sklearnex.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow by training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/accelerating_sklearnex/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/accelerating_sklearnex/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/accelerating_sklearnex/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/advanced_plotting/", "title": "Advanced plotting", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... 
Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=1)\natom.impute()\natom.encode()\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=1) atom.impute() atom.encode() <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n</pre> In\u00a0[4]: Copied! <pre># Let's see what the default aesthetics look like\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> # Let's see what the default aesthetics look like atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[5]: Copied! <pre># Change the color palette using color names or their hex codes\natom.palette = [\"red\", \"#00f\"]\n</pre> # Change the color palette using color names or their hex codes atom.palette = [\"red\", \"#00f\"] In\u00a0[6]: Copied! 
<pre>atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[7]: Copied! <pre># Change the title and label fontsize\natom.title_fontsize = 30\natom.label_fontsize = 24\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> # Change the title and label fontsize atom.title_fontsize = 30 atom.label_fontsize = 24 atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[8]: Copied! <pre># Use the update_layout method to change layout properties\natom.update_layout(template=\"simple_white\", barmode=\"group\", hovermode=\"x\")\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> # Use the update_layout method to change layout properties atom.update_layout(template=\"simple_white\", barmode=\"group\", hovermode=\"x\") atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[9]: Copied! <pre># Use the update_traces method to change the trace (note the y-axis)\natom.update_traces(histnorm=\"percent\", selector=dict(type=\"histogram\"))\natom.plot_distribution(columns=[1, 2], distributions=None, title=\"Distribution of temperatures\")\n</pre> # Use the update_traces method to change the trace (note the y-axis) atom.update_traces(histnorm=\"percent\", selector=dict(type=\"histogram\")) atom.plot_distribution(columns=[1, 2], distributions=None, title=\"Distribution of temperatures\") In\u00a0[10]: Copied! <pre># Let's go back to the default aesthetics\natom.reset_aesthetics()\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> # Let's go back to the default aesthetics atom.reset_aesthetics() atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[11]: Copied! 
<pre># And update the title with some custom fonts\natom.plot_distribution(\n    columns=[1, 2],\n    title=dict(\n        text=\"Distribution of temperatures\",\n        font_color=\"teal\",\n        x=0,\n        xanchor=\"left\",\n    )\n)\n</pre> # And update the title with some custom fonts atom.plot_distribution(     columns=[1, 2],     title=dict(         text=\"Distribution of temperatures\",         font_color=\"teal\",         x=0,         xanchor=\"left\",     ) ) In\u00a0[12]: Copied! <pre># We can update the legend in a similar fashion\natom.plot_distribution(\n    columns=[1, 2],\n    title=dict(\n        text=\"Distribution of temperatures\",\n        font_color=\"teal\",\n        x=0,\n        xanchor=\"left\",\n    ),\n    legend=dict(title=\"Legend's title\"),\n)\n</pre> # We can update the legend in a similar fashion atom.plot_distribution(     columns=[1, 2],     title=dict(         text=\"Distribution of temperatures\",         font_color=\"teal\",         x=0,         xanchor=\"left\",     ),     legend=dict(title=\"Legend's title\"), ) In\u00a0[13]: Copied! 
<pre>atom.run(\"LR\")\n\n# You can plot the ROC curve for a selection of rows,\n# for example, for rows in a specific location\natom.plot_roc(\n    rows={\n        \"Portland\": atom.test.loc[atom.og.X.Location == \"Portland\"],\n        \"Sydney\": atom.test.loc[atom.og.X.Location == \"Sydney\"],\n    }\n)\n</pre> atom.run(\"LR\")  # You can plot the ROC curve for a selection of rows, # for example, for rows in a specific location atom.plot_roc(     rows={         \"Portland\": atom.test.loc[atom.og.X.Location == \"Portland\"],         \"Sydney\": atom.test.loc[atom.og.X.Location == \"Sydney\"],     } ) <pre>\nTraining ========================= &gt;&gt;\nModels: LR\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6247\nTest evaluation --&gt; f1: 0.6093\nTime elapsed: 0.636s\n-------------------------------------------------\nTime: 0.636s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.044s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.6093\n</pre> In\u00a0[14]: Copied! <pre># Note how the same column over different plots is grouped\nwith atom.canvas(2, 2):\n    atom.plot_distribution(columns=1)\n    atom.plot_distribution(columns=2)\n    atom.plot_qq(columns=[1, 2], distributions=[\"norm\", \"invgauss\"])\n    atom.plot_qq(columns=[1, 2])\n</pre> # Note how the same column over different plots is grouped with atom.canvas(2, 2):     atom.plot_distribution(columns=1)     atom.plot_distribution(columns=2)     atom.plot_qq(columns=[1, 2], distributions=[\"norm\", \"invgauss\"])     atom.plot_qq(columns=[1, 2])"}, {"location": "examples/advanced_plotting/#example-advanced-plotting", "title": "Example: Advanced plotting\u00b6", "text": "<p>This example shows how to make the best use of all of atom's plotting options.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. 
The goal of this dataset is to predict whether or not it will rain tomorrow by training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/advanced_plotting/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-colors-and-font-size", "title": "Customize colors and font size\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-plots-layout", "title": "Customize the plot's layout\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-plots-traces", "title": "Customize the plot's traces\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-title-and-legend", "title": "Customize the title and legend\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customizing-the-rows-to-plot", "title": "Customizing the rows to plot\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#using-a-canvas", "title": "Using a canvas\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/", "title": "Automated feature scaling", "text": "In\u00a0[1]: Copied! <pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX, y = load_breast_cancer(return_X_y=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied! 
<pre>atom = ATOMClassifier(X, y, verbose=2, random_state=1)\n</pre> atom = ATOMClassifier(X, y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n</pre> In\u00a0[4]: Copied! <pre># Check which models require feature scaling\natom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]]\n</pre> # Check which models require feature scaling atom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]] Out[4]: acronym model needs_scaling 0 AdaB AdaBoost False 1 Bag Bagging False 2 BNB BernoulliNB False 3 CatB CatBoost True 4 CatNB CategoricalNB False 5 CNB ComplementNB False 6 Tree DecisionTree False 7 Dummy Dummy False 8 ETree ExtraTree False 9 ET ExtraTrees False 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM True 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM True 18 LR LogisticRegression True 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB False 21 PA PassiveAggressive True 22 Perc Perceptron True 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest False 26 Ridge Ridge True 27 SGD StochasticGradientDescent True 28 SVM SupportVectorMachine True 29 XGB XGBoost True In\u00a0[5]: Copied! 
<pre># We fit two models: LR needs scaling and Bag doesn't\natom.run([\"LR\", \"Bag\"])\n</pre> # We fit two models: LR needs scaling and Bag doesn't atom.run([\"LR\", \"Bag\"]) <pre>\nTraining ========================= &gt;&gt;\nModels: LR, Bag\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9913\nTest evaluation --&gt; f1: 0.9861\nTime elapsed: 0.051s\n-------------------------------------------------\nTime: 0.051s\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9982\nTest evaluation --&gt; f1: 0.9444\nTime elapsed: 0.111s\n-------------------------------------------------\nTime: 0.111s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.216s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9861 !\nBagging            --&gt; f1: 0.9444\n</pre> In\u00a0[6]: Copied! <pre># Now, we create a new branch and scale the features before fitting the model\natom.branch = \"scaling\"\n</pre> # Now, we create a new branch and scale the features before fitting the model atom.branch = \"scaling\" <pre>Successfully created new branch: scaling.\n</pre> In\u00a0[7]: Copied! <pre>atom.scale()\n</pre> atom.scale() <pre>Fitting Scaler...\nScaling features...\n</pre> In\u00a0[8]: Copied! <pre>atom.run(\"LR_2\")\n</pre> atom.run(\"LR_2\") <pre>\nTraining ========================= &gt;&gt;\nModels: LR_2\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9913\nTest evaluation --&gt; f1: 0.9861\nTime elapsed: 0.035s\n-------------------------------------------------\nTime: 0.035s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.057s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9861\n</pre> In\u00a0[9]: Copied! 
<pre># Let's compare the differences between the models\nprint(atom.lr.scaler)\nprint(atom.bag.scaler)\nprint(atom.lr_2.scaler)\n</pre> # Let's compare the differences between the models print(atom.lr.scaler) print(atom.bag.scaler) print(atom.lr_2.scaler) <pre>Scaler()\nNone\nNone\n</pre> In\u00a0[10]: Copied! <pre># And the data they use is different\nprint(atom.lr.X.iloc[:5, :3])\nprint(\"-----------------------------\")\nprint(atom.bag.X.iloc[:5, :3])\nprint(\"-----------------------------\")\nprint(atom.lr_2.X_train.equals(atom.lr.X_train))\n</pre> # And the data they use is different print(atom.lr.X.iloc[:5, :3]) print(\"-----------------------------\") print(atom.bag.X.iloc[:5, :3]) print(\"-----------------------------\") print(atom.lr_2.X_train.equals(atom.lr.X_train)) <pre>         x0        x1        x2\n0 -0.181875  0.356669 -0.147122\n1  1.162216  0.300578  1.159704\n2  1.056470  1.212060  0.933833\n3  0.277287  2.457753  0.188054\n4 -1.442482 -0.825921 -1.343434\n-----------------------------\n      x0     x1      x2\n0  13.48  20.82   88.40\n1  18.31  20.58  120.80\n2  17.93  24.48  115.20\n3  15.13  29.81   96.71\n4   8.95  15.76   58.74\n-----------------------------\nTrue\n</pre> In\u00a0[11]: Copied! <pre># Note that the scaler is included in the model's pipeline\nprint(atom.lr.pipeline)\nprint(\"-----------------------------\")\nprint(atom.bag.pipeline)\nprint(\"-----------------------------\")\nprint(atom.lr_2.pipeline)\n</pre> # Note that the scaler is included in the model's pipeline print(atom.lr.pipeline) print(\"-----------------------------\") print(atom.bag.pipeline) print(\"-----------------------------\") print(atom.lr_2.pipeline) <pre>Pipeline(memory=Memory(location=None), steps=[('AutomatedScaler', Scaler())])\n-----------------------------\nPipeline(memory=Memory(location=None), steps=[])\n-----------------------------\nPipeline(memory=Memory(location=None), steps=[('Scaler', Scaler(verbose=2))])\n</pre> In\u00a0[12]: Copied! 
<pre>atom.plot_pipeline()\n</pre> atom.plot_pipeline()"}, {"location": "examples/automated_feature_scaling/#example-automated-feature-scaling", "title": "Example: Automated feature scaling\u00b6", "text": "<p>This example shows how ATOM handles models that require automated feature scaling.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/automated_feature_scaling/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/binary_classification/", "title": "Binary classification", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 
76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre># Call atom using only 5% of the complete dataset (for explanatory purposes)\natom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=0.05, n_jobs=8, verbose=2)\n</pre> # Call atom using only 5% of the complete dataset (for explanatory purposes) atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=0.05, n_jobs=8, verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nParallel processing with 8 cores.\nParallelization backend: loky\n\nDataset stats ==================== &gt;&gt;\nShape: (7109, 22)\nTrain set size: 5688\nTest set size: 1421\n-------------------------------------\nMemory: 1.25 MB\nScaled: False\nMissing values: 15868 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 1 (0.0%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># Impute missing values\natom.impute(strat_num=\"median\", strat_cat=\"drop\", max_nan_rows=0.8)\n</pre> # Impute missing values atom.impute(strat_num=\"median\", strat_cat=\"drop\", max_nan_rows=0.8) <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Dropping 7 samples for containing more than 16 missing values.\n --&gt; Imputing 23 missing values with median (11.9) in feature MinTemp.\n --&gt; Imputing 10 missing values with median (22.6) in feature MaxTemp.\n --&gt; Imputing 72 missing values with median (0.0) in feature Rainfall.\n --&gt; Imputing 3059 missing values with median (4.6) in feature Evaporation.\n --&gt; Imputing 3382 missing values with median (8.5) in feature Sunshine.\n --&gt; Dropping 467 samples due to missing values in feature WindGustDir.\n --&gt; Imputing 466 missing values with median (39.0) in feature WindGustSpeed.\n --&gt; Dropping 479 samples due to missing values in feature WindDir9am.\n --&gt; Dropping 165 samples due to missing values in feature WindDir3pm.\n --&gt; Imputing 53 missing values with median (13.0) in feature WindSpeed9am.\n --&gt; Imputing 115 missing values with median (17.0) in feature WindSpeed3pm.\n --&gt; Imputing 72 missing values with median (70.0) in feature Humidity9am.\n --&gt; Imputing 164 missing values with median (52.0) in feature Humidity3pm.\n --&gt; Imputing 699 missing values with median (1017.7) in feature Pressure9am.\n --&gt; Imputing 699 missing values with median (1015.4) in feature Pressure3pm.\n --&gt; Imputing 2698 missing values with median (5.0) in feature Cloud9am.\n --&gt; Imputing 2903 missing values with median (5.0) in feature Cloud3pm.\n --&gt; Imputing 32 missing values with median (16.7) in feature Temp9am.\n --&gt; Imputing 116 missing values with median (21.1) in feature Temp3pm.\n --&gt; Dropping 72 samples due to missing values in feature RainToday.\n</pre> In\u00a0[5]: Copied! 
<pre># Encode the categorical features\natom.encode(strategy=\"Target\", max_onehot=10, infrequent_to_value=0.04)\n</pre> # Encode the categorical features atom.encode(strategy=\"Target\", max_onehot=10, infrequent_to_value=0.04) <pre>Fitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 47 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[6]: Copied! <pre># Train an Extra-Trees and a Random Forest model\natom.run(models=[\"ET\", \"RF\"], metric=\"f1\", n_bootstrap=5)\n</pre> # Train an Extra-Trees and a Random Forest model atom.run(models=[\"ET\", \"RF\"], metric=\"f1\", n_bootstrap=5) <pre>\nTraining ========================= &gt;&gt;\nModels: ET, RF\nMetric: f1\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.5956\nTime elapsed: 1.414s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5709 \u00b1 0.0198\nTime elapsed: 1.020s\n-------------------------------------------------\nTime: 2.434s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.6124\nTime elapsed: 0.337s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5802 \u00b1 0.0111\nTime elapsed: 1.281s\n-------------------------------------------------\nTime: 1.618s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 4.225s\n-------------------------------------\nExtraTrees   --&gt; f1: 0.5709 \u00b1 0.0198 ~\nRandomForest --&gt; f1: 0.5802 \u00b1 0.0111 ~ !\n</pre> In\u00a0[7]: Copied! 
<pre># Let's have a look at the final results\natom.results\n</pre> # Let's have a look at the final results atom.results Out[7]: f1_train f1_test time_fit f1_bootstrap time_bootstrap time ET 0.8503 0.5688 1.414043 0.570892 1.019728 2.433771 RF 0.8552 0.5612 0.336765 0.580178 1.281000 1.617765 In\u00a0[8]: Copied! <pre># Visualize the bootstrap results\natom.plot_results(title=\"RF vs ET performance\")\n</pre> # Visualize the bootstrap results atom.plot_results(title=\"RF vs ET performance\") In\u00a0[9]: Copied! <pre># Print the results of some common metrics\natom.evaluate()\n</pre> # Print the results of some common metrics atom.evaluate() Out[9]: accuracy ap ba f1 jaccard mcc precision recall auc ET 0.8478 0.6904 0.7059 0.5688 0.3974 0.5108 0.7750 0.4493 0.8561 RF 0.8405 0.6775 0.7038 0.5612 0.3901 0.4891 0.7283 0.4565 0.8502 In\u00a0[10]: Copied! <pre># The winner attribute calls the best model (atom.winner == atom.rf)\nprint(f\"The winner is the {atom.winner.name} model!!\")\n</pre> # The winner attribute calls the best model (atom.winner == atom.rf) print(f\"The winner is the {atom.winner.name} model!!\") <pre>The winner is the RF model!!\n</pre> In\u00a0[11]: Copied! <pre># Visualize the distribution of predicted probabilities\natom.winner.plot_probabilities()\n</pre> # Visualize the distribution of predicted probabilities atom.winner.plot_probabilities() In\u00a0[12]: Copied! <pre># Compare how different metrics perform for different thresholds\natom.winner.plot_threshold(metric=[\"f1\", \"accuracy\", \"ap\"], steps=50)\n</pre> # Compare how different metrics perform for different thresholds atom.winner.plot_threshold(metric=[\"f1\", \"accuracy\", \"ap\"], steps=50)"}, {"location": "examples/binary_classification/#example-binary-classification", "title": "Example: Binary classification\u00b6", "text": "<p>This example shows how to use ATOM to solve a binary classification problem. 
Additionally, we'll perform a variety of data cleaning steps to prepare the data for modeling.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow by training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/binary_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/binary_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/binary_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/calibration/", "title": "Calibration", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load the data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! 
<pre>atom = ATOMClassifier(X, \"RainTomorrow\", n_rows=1e4, verbose=1, warnings=False)\n\n# Apply data cleaning steps\natom.clean()\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")\natom.encode(strategy=\"target\", max_onehot=5, infrequent_to_value=0.05)\n\n# Train a Gaussian Naive Bayes model\natom.run(\"gnb\")\n</pre> atom = ATOMClassifier(X, \"RainTomorrow\", n_rows=1e4, verbose=1, warnings=False)  # Apply data cleaning steps atom.clean() atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\") atom.encode(strategy=\"target\", max_onehot=5, infrequent_to_value=0.05)  # Train a Gaussian Naive Bayes model atom.run(\"gnb\") <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (10000, 22)\nTrain set size: 8000\nTest set size: 2000\n-------------------------------------\nMemory: 1.76 MB\nScaled: False\nMissing values: 22184 (10.1%)\nCategorical features: 5 (23.8%)\n\nFitting Cleaner...\nCleaning the data...\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n\nTraining ========================= &gt;&gt;\nModels: GNB\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5807\nTest evaluation --&gt; f1: 0.5971\nTime elapsed: 0.094s\n-------------------------------------------------\nTime: 0.094s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.160s\n-------------------------------------\nGaussianNB --&gt; f1: 0.5971\n</pre> In\u00a0[4]: Copied! <pre># Check the model's calibration\natom.plot_calibration()\n</pre> # Check the model's calibration atom.plot_calibration() In\u00a0[5]: Copied! 
<pre># Let's try to improve it using the calibrate method\natom.winner.calibrate(method=\"isotonic\", cv=5)\n</pre> # Let's try to improve it using the calibrate method atom.winner.calibrate(method=\"isotonic\", cv=5) <pre>Results for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5034\nTest evaluation --&gt; f1: 0.5061\nTime elapsed: 0.282s\n</pre> In\u00a0[6]: Copied! <pre># And check again...\natom.plot_calibration()\n</pre> # And check again... atom.plot_calibration()"}, {"location": "examples/calibration/#example-calibration", "title": "Example: Calibration\u00b6", "text": "<p>This example shows how to calibrate a classifier through atom.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow by training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/calibration/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/calibration/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/calibration/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/deep_learning/", "title": "Deep learning", "text": "In\u00a0[1]: Copied! 
<pre># Disable annoying tf warnings\nimport os\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\nfrom tensorflow import get_logger\nget_logger().setLevel('ERROR')\n\nimport absl.logging\nabsl.logging.set_verbosity(absl.logging.ERROR)\n\nfrom atom import ATOMClassifier, ATOMModel\nfrom sklearn.preprocessing import FunctionTransformer\nfrom optuna.pruners import PatientPruner\nfrom optuna.distributions import CategoricalDistribution, IntDistribution\n\nfrom scikeras.wrappers import KerasClassifier\nfrom keras.datasets import mnist\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Flatten, Conv2D, Dropout\n</pre> # Disable annoying tf warnings import os os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"  from tensorflow import get_logger get_logger().setLevel('ERROR')  import absl.logging absl.logging.set_verbosity(absl.logging.ERROR)  from atom import ATOMClassifier, ATOMModel from sklearn.preprocessing import FunctionTransformer from optuna.pruners import PatientPruner from optuna.distributions import CategoricalDistribution, IntDistribution  from scikeras.wrappers import KerasClassifier from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense, Flatten, Conv2D, Dropout In\u00a0[2]: Copied! <pre># Download the MNIST dataset\n(X_train, y_train), (X_test, y_test) = mnist.load_data()\n\n# Flatten data to follow sklearn's API (2d input)\nX_train = X_train.reshape(len(X_train), -1)\nX_test = X_test.reshape(len(X_test), -1)\n\ndata = (X_train, y_train), (X_test, y_test)\n</pre> # Download the MNIST dataset (X_train, y_train), (X_test, y_test) = mnist.load_data()  # Flatten data to follow sklearn's API (2d input) X_train = X_train.reshape(len(X_train), -1) X_test = X_test.reshape(len(X_test), -1)  data = (X_train, y_train), (X_test, y_test) In\u00a0[3]: Copied! 
<pre># Create the convolutional neural network\nclass ConvNN(KerasClassifier):\n    \"\"\"Convolutional neural network model.\"\"\"\n\n    @property\n    def feature_encoder(self):\n        \"\"\"Convert the 2d input to the image's format (len(X), 28, 28, 1).\"\"\"\n        return FunctionTransformer(\n            func=lambda X: X.reshape(X.shape[0], 28, 28, 1),\n        )\n\n    @staticmethod\n    def _keras_build_fn(**kwargs):\n        \"\"\"Create the model's architecture.\"\"\"\n        model = Sequential()\n        model.add(\n            Conv2D(\n                filters=8,\n                kernel_size=3,\n                activation=\"relu\",\n                input_shape=(28, 28, 1),\n            )\n        )\n        model.add(Conv2D(filters=4, kernel_size=5, activation=\"relu\"))\n        model.add(Flatten())\n        model.add(Dense(units=10, activation=\"softmax\"))\n        model.compile(\n            optimizer=\"adam\",\n            loss=\"sparse_categorical_crossentropy\",\n        )\n\n        return model\n</pre> # Create the convolutional neural network class ConvNN(KerasClassifier):     \"\"\"Convolutional neural network model.\"\"\"      @property     def feature_encoder(self):         \"\"\"Convert the 2d input to the image's format (len(X), 28, 28, 1).\"\"\"         return FunctionTransformer(             func=lambda X: X.reshape(X.shape[0], 28, 28, 1),         )      @staticmethod     def _keras_build_fn(**kwargs):         \"\"\"Create the model's architecture.\"\"\"         model = Sequential()         model.add(             Conv2D(                 filters=8,                 kernel_size=3,                 activation=\"relu\",                 input_shape=(28, 28, 1),             )         )         model.add(Conv2D(filters=4, kernel_size=5, activation=\"relu\"))         model.add(Flatten())         model.add(Dense(units=10, activation=\"softmax\"))         model.compile(             optimizer=\"adam\",             
loss=\"sparse_categorical_crossentropy\",         )          return model In\u00a0[4]: Copied! <pre># Convert the model to an ATOM model\nmodel = ATOMModel(\n    estimator=ConvNN(verbose=0),\n    acronym=\"CNN\",\n    needs_scaling=True,  # Applies automated feature scaling before fitting\n    has_validation=\"epochs\",  # Applies in-training validation on parameter epochs\n)\n</pre> # Convert the model to an ATOM model model = ATOMModel(     estimator=ConvNN(verbose=0),     acronym=\"CNN\",     needs_scaling=True,  # Applies automated feature scaling before fitting     has_validation=\"epochs\",  # Applies in-training validation on parameter epochs ) In\u00a0[5]: Copied! <pre>atom = ATOMClassifier(*data, n_rows=0.1, verbose=2, random_state=1)\n</pre> atom = ATOMClassifier(*data, n_rows=0.1, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multiclass classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (7000, 785)\nTrain set size: 6000\nTest set size: 1000\n-------------------------------------\nMemory: 5.54 MB\nScaled: False\nOutlier values: 41839 (0.9%)\n\n</pre> In\u00a0[6]: Copied! 
<pre># Like any other model, we can define custom distributions for hyperparameter tuning\natom.run(\n    models=model,\n    metric=\"f1_weighted\",\n    n_trials=12,\n    ht_params={\n        \"distributions\": {\n            \"epochs\": IntDistribution(2, 10),\n            \"batch_size\": CategoricalDistribution([128, 256, 512]),\n        },\n    }\n)\n</pre> # Like any other model, we can define custom distributions for hyperparameter tuning atom.run(     models=model,     metric=\"f1_weighted\",     n_trials=12,     ht_params={         \"distributions\": {             \"epochs\": IntDistribution(2, 10),             \"batch_size\": CategoricalDistribution([128, 256, 512]),         },     } ) <pre>\nTraining ========================= &gt;&gt;\nModels: CNN\nMetric: f1_weighted\n\n\nRunning hyperparameter tuning for ConvNN...\n| trial |  epochs | batch_size | f1_weighted | best_f1_weighted | time_trial | time_ht |    state |\n| ----- | ------- | ---------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |       5 |        128 |      0.9147 |           0.9147 |     9.127s |  9.127s | COMPLETE |\n| 1     |       3 |        512 |      0.8539 |           0.9147 |     4.995s | 14.122s | COMPLETE |\n| 2     |       5 |        512 |      0.8931 |           0.9147 |     7.712s | 21.834s | COMPLETE |\n| 3     |       3 |        128 |       0.901 |           0.9147 |     5.706s | 27.540s | COMPLETE |\n| 4     |       5 |        128 |      0.9147 |           0.9147 |     0.607s | 28.147s | COMPLETE |\n| 5     |       9 |        128 |      0.9251 |           0.9251 |    15.297s | 43.443s | COMPLETE |\n| 6     |       9 |        128 |      0.9251 |           0.9251 |     1.230s | 44.673s | COMPLETE |\n| 7     |       3 |        128 |       0.901 |           0.9251 |     0.636s | 45.309s | COMPLETE |\n| 8     |      10 |        256 |      0.8131 |           0.9251 |     2.573s | 47.882s |   PRUNED |\n| 9     |       8 |        128 |      0.9191 |    
       0.9251 |    14.014s | 01m:02s |   PRUNED |\n| 10    |       7 |        256 |       0.836 |           0.9251 |     2.498s | 01m:04s |   PRUNED |\n| 11    |      10 |        128 |      0.9431 |           0.9431 |    16.725s | 01m:21s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 11\nBest parameters:\n --&gt; epochs: 10\n --&gt; batch_size: 128\nBest evaluation --&gt; f1_weighted: 0.9431\nTime elapsed: 01m:21s\nFit ---------------------------------------------\nTrain evaluation --&gt; f1_weighted: 0.9835\nTest evaluation --&gt; f1_weighted: 0.952\nTime elapsed: 28.600s\n-------------------------------------------------\nTime: 01m:50s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 03m:39s\n-------------------------------------\nConvNN --&gt; f1_weighted: 0.952\n</pre> In\u00a0[7]: Copied! <pre>atom.cnn.trials\n</pre> atom.cnn.trials Out[7]: epochs batch_size estimator f1_weighted best_f1_weighted time_trial time_ht state trial 0 5 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.914748 0.943121 9.126504 9.126504 COMPLETE 1 3 512 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.853919 0.943121 4.995052 14.121556 COMPLETE 2 5 512 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.893128 0.943121 7.712461 21.834017 COMPLETE 3 3 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.900996 0.943121 5.705581 27.539598 COMPLETE 4 5 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.914748 0.943121 0.607057 28.146655 COMPLETE 5 9 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.925088 0.943121 15.296670 43.443325 COMPLETE 6 9 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.925088 0.943121 1.229779 44.673104 COMPLETE 7 3 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.900996 0.943121 0.635578 45.308682 COMPLETE 8 10 256 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 
0.813073 0.943121 2.573343 47.882025 PRUNED 9 8 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.919095 0.943121 14.014060 61.896085 PRUNED 10 7 256 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.835966 0.943121 2.498169 64.394254 PRUNED 11 10 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.943121 0.943121 16.725048 81.119302 COMPLETE In\u00a0[8]: Copied! <pre>atom.plot_evals(dataset=\"test+train\")\n</pre> atom.plot_evals(dataset=\"test+train\") In\u00a0[9]: Copied! <pre># Use the prediction methods like any other model\natom.cnn.predict_proba(X_train)\n</pre> # Use the prediction methods like any other model atom.cnn.predict_proba(X_train) Out[9]: 0 1 2 3 4 5 6 7 8 9 0 6.981344e-08 1.163047e-08 1.302092e-07 7.298404e-01 4.980663e-11 2.701415e-01 6.764501e-11 1.982446e-06 5.807213e-07 1.532895e-05 1 9.999958e-01 2.160013e-12 2.527803e-06 1.498349e-07 2.094386e-09 4.418725e-07 6.460270e-07 2.255171e-07 2.042284e-08 7.188346e-08 2 1.154879e-10 2.405690e-10 1.185454e-07 3.165163e-07 9.995613e-01 1.887145e-11 6.159564e-12 4.155245e-04 1.546579e-09 2.274483e-05 3 5.565947e-07 9.992028e-01 6.758810e-04 3.334095e-06 2.312364e-05 9.298934e-08 1.309337e-07 7.859311e-05 1.515798e-05 3.681653e-07 4 4.683458e-09 4.092270e-08 3.246872e-07 1.020155e-06 2.804452e-03 9.423515e-08 3.789635e-12 8.406813e-03 7.883451e-05 9.887084e-01 ... ... ... ... ... ... ... ... ... ... ... 
59995 7.329114e-09 4.127999e-08 3.695257e-06 1.461548e-04 1.231008e-09 6.157245e-06 2.624072e-11 8.209722e-09 9.998319e-01 1.199038e-05 59996 6.239399e-08 2.397851e-09 1.575265e-03 9.643788e-01 8.514269e-08 1.101398e-04 1.774388e-10 1.135693e-07 3.362476e-02 3.106496e-04 59997 7.059591e-10 5.808693e-09 1.657147e-11 3.829917e-05 3.490374e-07 9.998387e-01 4.054391e-11 4.646493e-11 1.087904e-04 1.385001e-05 59998 1.183419e-05 2.104532e-09 1.940764e-06 1.050059e-07 8.195059e-06 5.124656e-06 9.999721e-01 4.185512e-09 7.723169e-07 1.096977e-09 59999 3.987676e-04 1.140556e-06 4.448286e-04 4.279935e-06 1.410985e-07 2.539659e-03 8.256741e-08 8.921248e-08 9.958331e-01 7.779775e-04 <p>60000 rows \u00d7 10 columns</p> In\u00a0[10]: Copied! <pre># Or make plots...\natom.cnn.plot_hyperparameters()\n</pre> # Or make plots... atom.cnn.plot_hyperparameters() In\u00a0[11]: Copied! <pre>atom.plot_parallel_coordinate()\n</pre> atom.plot_parallel_coordinate()"}, {"location": "examples/deep_learning/#example-deep-learning", "title": "Example: Deep learning\u00b6", "text": "<p>This example shows how to use ATOM to train and validate a Convolutional Neural Network implemented with Keras using scikeras.</p> <p>Import the MNIST dataset from keras.datasets. This is a well known image dataset whose goal is to classify handwritten digits.</p>"}, {"location": "examples/deep_learning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/deep_learning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/deep_learning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/ensembles/", "title": "Ensembles", "text": "In\u00a0[1]: Copied! <pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied! 
<pre># Load the data\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True, as_frame=True) In\u00a0[3]: Copied! <pre># Initialize atom and train several models\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\natom.run(models=[\"LR\", \"Tree\", \"LGB\"], metric=\"accuracy\")\n</pre> # Initialize atom and train several models atom = ATOMClassifier(X, y, verbose=2, random_state=1) atom.run(models=[\"LR\", \"Tree\", \"LGB\"], metric=\"accuracy\") <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n\nTraining ========================= &gt;&gt;\nModels: LR, Tree, LGB\nMetric: accuracy\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 0.989\nTest evaluation --&gt; accuracy: 0.9823\nTime elapsed: 0.048s\n-------------------------------------------------\nTime: 0.048s\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 1.0\nTest evaluation --&gt; accuracy: 0.9469\nTime elapsed: 0.042s\n-------------------------------------------------\nTime: 0.042s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 1.0\nTest evaluation --&gt; accuracy: 0.9469\nTime elapsed: 0.246s\n-------------------------------------------------\nTime: 0.246s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.419s\n-------------------------------------\nLogisticRegression --&gt; accuracy: 0.9823 !\nDecisionTree       --&gt; accuracy: 0.9469\nLightGBM           --&gt; accuracy: 0.9469\n</pre> In\u00a0[4]: Copied! 
<pre># Combine the models into a Voting model\natom.voting(voting=\"soft\")\n</pre> # Combine the models into a Voting model atom.voting(voting=\"soft\") <pre>Results for Voting:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 1.0\nTest evaluation --&gt; accuracy: 0.9469\nTime elapsed: 0.055s\n</pre> In\u00a0[5]: Copied! <pre># Note that we now have an extra model in the pipeline\natom.models\n</pre> # Note that we now have an extra model in the pipeline atom.models Out[5]: <pre>['LR', 'Tree', 'LGB', 'Vote']</pre> In\u00a0[6]: Copied! <pre># The plot_pipeline method helps us visualize the ensemble\natom.plot_pipeline()\n</pre> # The plot_pipeline method helps us visualize the ensemble atom.plot_pipeline() In\u00a0[7]: Copied! <pre># The Vote model averages the scores of the models it contains\natom.vote\n</pre> # The Vote model averages the scores of the models it contains atom.vote Out[7]: <pre>Voting()</pre> In\u00a0[8]: Copied! <pre># We can use it like any other model to make predictions or plots\natom.vote.predict_proba(range(10))\n</pre> # We can use it like any other model to make predictions or plots atom.vote.predict_proba(range(10)) Out[8]: 0 1 0 0.961516 0.038484 1 0.999968 0.000032 2 0.998743 0.001257 3 0.968071 0.031929 4 0.000014 0.999986 5 0.999991 0.000009 6 0.000019 0.999981 7 0.000015 0.999985 8 0.000026 0.999974 9 0.002627 0.997373 In\u00a0[9]: Copied! <pre>atom.vote.plot_threshold(metric=[\"auc\", \"recall\", \"accuracy\"])\n</pre> atom.vote.plot_threshold(metric=[\"auc\", \"recall\", \"accuracy\"]) In\u00a0[10]: Copied! <pre>atom.plot_results(legend=None)\n</pre> atom.plot_results(legend=None) In\u00a0[11]: Copied! <pre>atom.delete(\"vote\")\n</pre> atom.delete(\"vote\") <pre>Deleting 1 models...\n --&gt; Model Vote successfully deleted.\n</pre> In\u00a0[12]: Copied! 
<pre># Just like Voting, we can create a Stacking model\natom.stacking(final_estimator=\"LDA\")\n</pre> # Just like Voting, we can create a Stacking model atom.stacking(final_estimator=\"LDA\") <pre>Results for Stacking:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 0.9934\nTest evaluation --&gt; accuracy: 0.9823\nTime elapsed: 0.728s\n</pre> In\u00a0[13]: Copied! <pre># The final estimator uses the predictions of the underlying models\natom.stack.head()\n</pre> # The final estimator uses the predictions of the underlying models atom.stack.head() Out[13]: mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension target 0 13.48 20.82 88.40 559.2 0.10160 0.12550 0.10630 0.05439 0.1720 0.06419 ... 26.02 107.30 740.4 0.1610 0.42250 0.5030 0.22580 0.2807 0.10710 0 1 18.31 20.58 120.80 1052.0 0.10680 0.12480 0.15690 0.09451 0.1860 0.05941 ... 26.20 142.20 1493.0 0.1492 0.25360 0.3759 0.15100 0.3074 0.07863 0 2 17.93 24.48 115.20 998.9 0.08855 0.07027 0.05699 0.04744 0.1538 0.05510 ... 34.69 135.10 1320.0 0.1315 0.18060 0.2080 0.11360 0.2504 0.07948 0 3 15.13 29.81 96.71 719.5 0.08320 0.04605 0.04686 0.02739 0.1852 0.05294 ... 36.91 110.10 931.4 0.1148 0.09866 0.1547 0.06575 0.3233 0.06165 0 4 8.95 15.76 58.74 245.2 0.09462 0.12430 0.09263 0.02308 0.1305 0.07163 ... 17.07 63.34 270.0 0.1179 0.18790 0.1544 0.03846 0.1652 0.07722 1 <p>5 rows \u00d7 31 columns</p> In\u00a0[14]: Copied! 
<pre># Again, the model can be used for predictions or plots\natom.stack.predict(X)\n</pre> # Again, the model can be used for predictions or plots atom.stack.predict(X) Out[14]: <pre>0      0\n1      0\n2      0\n3      0\n4      1\n      ..\n564    1\n565    0\n566    0\n567    0\n568    1\nName: target, Length: 569, dtype: int64</pre> In\u00a0[15]: Copied! <pre>atom.stack.plot_shap_beeswarm(show=10)\n</pre> atom.stack.plot_shap_beeswarm(show=10) <pre>PermutationExplainer explainer: 114it [00:48,  2.01it/s]                                                                                                                                                                                                                                                             \n</pre>"}, {"location": "examples/ensembles/#example-ensembles", "title": "Example: Ensembles\u00b6", "text": "<p>This example shows how to use atom's ensemble techniques to improve predictions on a dataset combining several models.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/ensembles/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/ensembles/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/ensembles/#voting", "title": "Voting\u00b6", "text": ""}, {"location": "examples/ensembles/#stacking", "title": "Stacking\u00b6", "text": ""}, {"location": "examples/feature_engineering/", "title": "Feature engineering", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! 
<pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre># Initialize atom and apply data cleaning\natom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0)\natom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)\natom.encode(max_onehot=10, infrequent_to_value=0.04)\n</pre> # Initialize atom and apply data cleaning atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0) atom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8) atom.encode(max_onehot=10, infrequent_to_value=0.04) In\u00a0[4]: Copied! 
<pre>atom.verbose = 2  # Increase verbosity to see the output\n\n# Let's see how a LightGBM model performs\natom.run('LGB', metric='auc')\n</pre> atom.verbose = 2  # Increase verbosity to see the output  # Let's see how a LightGBM model performs atom.run('LGB', metric='auc') <pre>\nTraining ========================= &gt;&gt;\nModels: LGB\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9817\nTest evaluation --&gt; auc: 0.8584\nTime elapsed: 0.831s\n-------------------------------------------------\nTime: 0.831s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.963s\n-------------------------------------\nLightGBM --&gt; auc: 0.8584\n</pre> In\u00a0[5]: Copied! <pre># Since we are going to compare different datasets,\n# we need to create separate branches\natom.branch = \"dfs\"\n</pre> # Since we are going to compare different datasets, # we need to create separate branches atom.branch = \"dfs\" <pre>Successfully created new branch: dfs.\n</pre> In\u00a0[6]: Copied! <pre># Create 50 new features using dfs\natom.feature_generation(\"dfs\", n_features=50, operators=[\"add\", \"sub\", \"log\"])\n</pre> # Create 50 new features using dfs atom.feature_generation(\"dfs\", n_features=50, operators=[\"add\", \"sub\", \"log\"]) <pre>Fitting FeatureGenerator...\nGenerating new features...\n --&gt; 50 new features were added.\n</pre> In\u00a0[7]: Copied! <pre># The warnings warn us that some operators created missing values!\n# We can see the columns with missing values using the nans attribute\natom.nans\n</pre> # The warnings warn us that some operators created missing values! 
# We can see the columns with missing values using the nans attribute atom.nans Out[7]: <pre>Location                       0\nMinTemp                        0\nMaxTemp                        0\nRainfall                       0\nEvaporation                    0\n                              ..\nTemp9am - WindDir3pm           0\nWindDir9am + WindGustSpeed     0\nWindDir9am + WindSpeed3pm      0\nWindGustDir + WindSpeed9am     0\nWindSpeed3pm - WindSpeed9am    0\nLength: 73, dtype: int64</pre> In\u00a0[8]: Copied! <pre># Turn off warnings in the future\natom.warnings = False\n\n# Impute the data again to get rid of the missing values\natom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)\n</pre> # Turn off warnings in the future atom.warnings = False  # Impute the data again to get rid of the missing values atom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8) <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Imputing 12 missing values using the KNN imputer in feature NATURAL_LOGARITHM(Temp3pm).\n</pre> In\u00a0[9]: Copied! <pre># 50 new features may be to much...\n# Let's check for multicollinearity and use rfecv to reduce the number\natom.feature_selection(\n    strategy=\"rfecv\",\n    solver=\"LGB\",\n    n_features=30,\n    scoring=\"auc\",\n    max_correlation=0.98,\n)\n</pre> # 50 new features may be to much... 
# Let's check for multicollinearity and use rfecv to reduce the number atom.feature_selection(     strategy=\"rfecv\",     solver=\"LGB\",     n_features=30,     scoring=\"auc\",     max_correlation=0.98, ) <pre>Fitting FeatureSelector...\nPerforming feature selection ...\n --&gt; Feature MinTemp was removed due to collinearity with another feature.\n --&gt; Feature MinTemp + RainToday_No was removed due to collinearity with another feature.\n --&gt; Feature MaxTemp was removed due to collinearity with another feature.\n --&gt; Feature MaxTemp + WindDir3pm was removed due to collinearity with another feature.\n --&gt; Feature MaxTemp + WindGustDir was removed due to collinearity with another feature.\n --&gt; Feature Rainfall was removed due to collinearity with another feature.\n --&gt; Feature Rainfall + RainToday_rare was removed due to collinearity with another feature.\n --&gt; Feature Rainfall + WindDir3pm was removed due to collinearity with another feature.\n --&gt; Feature Sunshine was removed due to collinearity with another feature.\n --&gt; Feature Sunshine - WindDir3pm was removed due to collinearity with another feature.\n --&gt; Feature WindGustSpeed was removed due to collinearity with another feature.\n --&gt; Feature WindSpeed9am was removed due to collinearity with another feature.\n --&gt; Feature WindSpeed3pm was removed due to collinearity with another feature.\n --&gt; Feature Humidity9am was removed due to collinearity with another feature.\n --&gt; Feature Humidity3pm was removed due to collinearity with another feature.\n --&gt; Feature NATURAL_LOGARITHM(Pressure3pm) was removed due to collinearity with another feature.\n --&gt; Feature Pressure3pm - RainToday_Yes was removed due to collinearity with another feature.\n --&gt; Feature Cloud9am + RainToday_No was removed due to collinearity with another feature.\n --&gt; Feature Cloud3pm was removed due to collinearity with another feature.\n --&gt; Feature Cloud3pm + Location was removed 
due to collinearity with another feature.\n --&gt; Feature Temp9am - WindDir3pm was removed due to collinearity with another feature.\n --&gt; Feature Temp3pm was removed due to collinearity with another feature.\n --&gt; Feature Temp3pm - WindDir9am was removed due to collinearity with another feature.\n --&gt; Feature RainToday_rare was removed due to collinearity with another feature.\n --&gt; rfecv selected 38 features from the dataset.\n   --&gt; Dropping feature Location (rank 12).\n   --&gt; Dropping feature Cloud9am (rank 2).\n   --&gt; Dropping feature RainToday_No (rank 10).\n   --&gt; Dropping feature RainToday_Yes (rank 11).\n   --&gt; Dropping feature Location + RainToday_rare (rank 9).\n   --&gt; Dropping feature Location - Pressure9am (rank 4).\n   --&gt; Dropping feature Location - Temp9am (rank 7).\n   --&gt; Dropping feature Location - WindGustDir (rank 8).\n   --&gt; Dropping feature RainToday_No - WindSpeed3pm (rank 3).\n   --&gt; Dropping feature RainToday_rare + Temp3pm (rank 5).\n   --&gt; Dropping feature Rainfall + RainToday_Yes (rank 6).\n</pre> In\u00a0[10]: Copied! <pre># The collinear attribute shows what features were removed due to multicollinearity\natom.collinear_\n</pre> # The collinear attribute shows what features were removed due to multicollinearity atom.collinear_ Out[10]: drop corr_feature corr_value 0 MinTemp MinTemp + RainToday_No, MinTemp + RainToday_Yes 0.9978, 0.9979 1 MinTemp + RainToday_No MinTemp, MinTemp + RainToday_Yes 0.9978, 0.9914 2 MaxTemp MaxTemp + WindDir3pm, MaxTemp + WindDir9am, Ma... 1.0, 1.0, 1.0 3 MaxTemp + WindDir3pm MaxTemp, MaxTemp + WindDir9am, MaxTemp + WindG... 1.0, 1.0, 1.0 4 MaxTemp + WindGustDir MaxTemp, MaxTemp + WindDir3pm, MaxTemp + WindD... 1.0, 1.0, 1.0 5 Rainfall Rainfall + RainToday_Yes, Rainfall + RainToday... 0.999, 0.9999, 1.0 6 Rainfall + RainToday_rare Rainfall, Rainfall + RainToday_Yes, Rainfall +... 
0.9999, 0.9989, 0.9999 7 Rainfall + WindDir3pm Rainfall, Rainfall + RainToday_Yes, Rainfall +... 1.0, 0.999, 0.9999 8 Sunshine RainToday_rare + Sunshine, Sunshine - WindDir3pm 0.9994, 0.9998 9 Sunshine - WindDir3pm Sunshine, RainToday_rare + Sunshine 0.9998, 0.9993 10 WindGustSpeed WindDir9am + WindGustSpeed 1.0 11 WindSpeed9am WindGustDir + WindSpeed9am 1.0 12 WindSpeed3pm WindDir9am + WindSpeed3pm 1.0 13 Humidity9am Humidity9am + WindGustDir 1.0 14 Humidity3pm Humidity3pm - Sunshine 0.9937 15 NATURAL_LOGARITHM(Pressure3pm) Pressure3pm, Pressure3pm - RainToday_Yes 1.0, 0.9981 16 Pressure3pm - RainToday_Yes Pressure3pm, NATURAL_LOGARITHM(Pressure3pm) 0.9981, 0.9981 17 Cloud9am + RainToday_No Cloud9am 0.9828 18 Cloud3pm Cloud3pm + Location, Cloud3pm + RainToday_rare 1.0, 0.9991 19 Cloud3pm + Location Cloud3pm, Cloud3pm + RainToday_rare 1.0, 0.9991 20 Temp9am - WindDir3pm Temp9am 1.0 21 Temp3pm RainToday_rare + Temp3pm, Temp3pm - WindDir9am 0.9999, 1.0 22 Temp3pm - WindDir9am Temp3pm, RainToday_rare + Temp3pm 1.0, 0.9999 23 RainToday_rare Location + RainToday_rare 1.0 In\u00a0[11]: Copied! <pre># After applying rfecv, we can plot the score per number of features\natom.plot_rfecv()\n</pre> # After applying rfecv, we can plot the score per number of features atom.plot_rfecv() In\u00a0[12]: Copied! 
<pre># Let's see how the model performs now\n# Add a tag to the model's acronym to not overwrite previous LGB\natom.run(\"LGB_dfs\", errors=\"raise\")\n</pre> # Let's see how the model performs now # Add a tag to the model's acronym to not overwrite previous LGB atom.run(\"LGB_dfs\", errors=\"raise\") <pre>\nTraining ========================= &gt;&gt;\nModels: LGB_dfs\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9893\nTest evaluation --&gt; auc: 0.8572\nTime elapsed: 1.045s\n-------------------------------------------------\nTime: 1.045s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.186s\n-------------------------------------\nLightGBM --&gt; auc: 0.8572\n</pre> In\u00a0[13]: Copied! <pre># Create another branch for the genetic features\n# Split form master to avoid the dfs features\natom.branch = \"gfg_from_main\"\n</pre> # Create another branch for the genetic features # Split form master to avoid the dfs features atom.branch = \"gfg_from_main\" <pre>Successfully created new branch: gfg.\n</pre> In\u00a0[14]: Copied! 
<pre># Create new features using Genetic Programming\natom.feature_generation(strategy='gfg', n_features=20)\n</pre> # Create new features using Genetic Programming atom.feature_generation(strategy='gfg', n_features=20) <pre>Fitting FeatureGenerator...\n    |   Population Average    |             Best Individual              |\n---- ------------------------- ------------------------------------------ ----------\n Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left\n   0     3.08         0.137852        3         0.505879              N/A     18.62s\n   1     3.30         0.332951        6         0.506041              N/A     19.23s\n   2     3.92         0.429317        7         0.525775              N/A     18.31s\n   3     4.64         0.459817        9         0.532823              N/A     16.25s\n   4     6.59         0.475058       11         0.540078              N/A     15.51s\n   5     8.04         0.498345       13          0.54114              N/A     14.56s\n   6     9.80         0.509423       13         0.543911              N/A     13.87s\n   7    10.86         0.513225       15         0.551242              N/A     13.28s\n   8    11.54         0.513973       15         0.554127              N/A     11.99s\n   9    12.21         0.516725       19         0.554172              N/A     11.44s\n  10    13.09         0.520543       17         0.556923              N/A     10.19s\n  11    13.24         0.519283       17         0.556923              N/A      9.07s\n  12    12.74          0.51949       21         0.558114              N/A      7.95s\n  13    13.88         0.521709       21         0.558114              N/A      6.68s\n  14    15.99         0.523381       19         0.558673              N/A      6.12s\n  15    16.74         0.523708       19         0.558673              N/A      7.97s\n  16    16.84         0.524509       19         0.560449              N/A      6.02s\n  17    16.79         0.525061      
 19         0.560449              N/A      2.26s\n  18    16.77         0.523639       21         0.561281              N/A      1.11s\n  19    17.03         0.524261       23         0.561813              N/A      0.00s\nGenerating new features...\n --&gt; 20 new features were added.\n</pre> In\u00a0[16]: Copied! <pre># We can see the feature's fitness and description through the genetic_features attribute\natom.genetic_features_\n</pre> # We can see the feature's fitness and description through the genetic_features attribute atom.genetic_features_ Out[16]: name description fitness 0 x23 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 1 x24 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.541449 2 x25 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.541449 3 x26 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 4 x27 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 5 x28 mul(add(Cloud3pm, add(Cloud3pm, mul(add(WindGu... 0.541322 6 x29 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.541229 7 x30 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.541229 8 x31 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.540696 9 x32 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540674 10 x33 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.540674 11 x34 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540674 12 x35 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540281 13 x36 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540281 14 x37 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539923 15 x38 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539923 16 x39 mul(add(WindGustSpeed, add(Humidity3pm, Rainfa... 0.539923 17 x40 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.539923 18 x41 mul(mul(add(Cloud3pm, add(Cloud3pm, mul(Humidi... 0.539923 19 x42 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539909 In\u00a0[17]: Copied! 
<pre># Fit the model again\natom.run(\"LGB_gfg\", metric=\"auc\")\n</pre> # Fit the model again atom.run(\"LGB_gfg\", metric=\"auc\") <pre>\nTraining ========================= &gt;&gt;\nModels: LGB_gfg\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9857\nTest evaluation --&gt; auc: 0.8558\nTime elapsed: 1.044s\n-------------------------------------------------\nTime: 1.044s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.227s\n-------------------------------------\nLightGBM --&gt; auc: 0.8558\n</pre> In\u00a0[18]: Copied! <pre># Visualize the whole pipeline\natom.plot_pipeline()\n</pre> # Visualize the whole pipeline atom.plot_pipeline() In\u00a0[19]: Copied! <pre># Use atom's plots to compare the three models\natom.plot_roc(rows=\"test+train\")\n</pre> # Use atom's plots to compare the three models atom.plot_roc(rows=\"test+train\") In\u00a0[23]: Copied! <pre># To compare other plots it might be useful to use a canvas\nwith atom.canvas(1, 2, figsize=(1800, 800)):\n    atom.lgb_dfs.plot_roc(rows=\"test+train\")\n    atom.lgb_dfs.plot_feature_importance(show=10, title=\"LGB + dfs\")\n</pre> # To compare other plots it might be useful to use a canvas with atom.canvas(1, 2, figsize=(1800, 800)):     atom.lgb_dfs.plot_roc(rows=\"test+train\")     atom.lgb_dfs.plot_feature_importance(show=10, title=\"LGB + dfs\") In\u00a0[21]: Copied! <pre># We can check the feature importance with other plots as well\natom.plot_permutation_importance(models=[\"LGB_dfs\", \"LGB_gfg\"], show=12)\n</pre> # We can check the feature importance with other plots as well atom.plot_permutation_importance(models=[\"LGB_dfs\", \"LGB_gfg\"], show=12) In\u00a0[24]: Copied! 
<pre>atom.LGB_gfg.plot_shap_decision(rows=(0, 10), show=15)\n</pre> atom.LGB_gfg.plot_shap_decision(rows=(0, 10), show=15)"}, {"location": "examples/feature_engineering/#example-feature-engineering", "title": "Example: Feature engineering\u00b6", "text": "<p>This example shows how to use automated feature generation to improve a model's performance.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/feature_engineering/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/feature_engineering/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/feature_engineering/#deep-feature-synthesis", "title": "Deep Feature Synthesis\u00b6", "text": ""}, {"location": "examples/feature_engineering/#genetic-feature-generation", "title": "Genetic Feature Generation\u00b6", "text": ""}, {"location": "examples/feature_engineering/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/getting_started/", "title": "Getting started", "text": "In\u00a0[1]: Copied! <pre>import pandas as pd\nfrom atom import ATOMClassifier\n\n# Load the Australian Weather dataset\nX = pd.read_csv(\"https://raw.githubusercontent.com/tvdboom/ATOM/master/examples/datasets/weatherAUS.csv\")\n</pre> import pandas as pd from atom import ATOMClassifier  # Load the Australian Weather dataset X = pd.read_csv(\"https://raw.githubusercontent.com/tvdboom/ATOM/master/examples/datasets/weatherAUS.csv\") In\u00a0[2]: Copied! 
<pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=1000, verbose=2)\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=1000, verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (1000, 22)\nTrain set size: 800\nTest set size: 200\n-------------------------------------\nMemory: 176.13 kB\nScaled: False\nMissing values: 2260 (10.3%)\nCategorical features: 5 (23.8%)\n\n</pre> In\u00a0[3]: Copied! <pre>atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \natom.encode(strategy=\"Target\", max_onehot=8)\n</pre> atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")   atom.encode(strategy=\"Target\", max_onehot=8) <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Imputing 8 missing values with median (11.6) in feature MinTemp.\n --&gt; Imputing 2 missing values with median (22.3) in feature MaxTemp.\n --&gt; Imputing 12 missing values with median (0.0) in feature Rainfall.\n --&gt; Imputing 425 missing values with median (4.8) in feature Evaporation.\n --&gt; Imputing 480 missing values with median (8.55) in feature Sunshine.\n --&gt; Imputing 59 missing values with most_frequent (N) in feature WindGustDir.\n --&gt; Imputing 59 missing values with median (37.0) in feature WindGustSpeed.\n --&gt; Imputing 90 missing values with most_frequent (N) in feature WindDir9am.\n --&gt; Imputing 28 missing values with most_frequent (SW) in feature WindDir3pm.\n --&gt; Imputing 10 missing values with median (13.0) in feature WindSpeed9am.\n --&gt; Imputing 19 missing values with median (17.0) in feature WindSpeed3pm.\n --&gt; Imputing 17 missing values with median (70.0) in feature Humidity9am.\n --&gt; Imputing 31 missing values with median (51.0) in feature Humidity3pm.\n --&gt; Imputing 89 missing values with median (1017.8) in feature Pressure9am.\n --&gt; Imputing 87 
missing values with median (1015.2) in feature Pressure3pm.\n --&gt; Imputing 383 missing values with median (5.0) in feature Cloud9am.\n --&gt; Imputing 412 missing values with median (5.0) in feature Cloud3pm.\n --&gt; Imputing 11 missing values with median (16.5) in feature Temp9am.\n --&gt; Imputing 26 missing values with median (20.7) in feature Temp3pm.\n --&gt; Imputing 12 missing values with most_frequent (No) in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 49 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[4]: Copied! <pre>atom.run(models=[\"LDA\", \"AdaB\"], metric=\"auc\", n_trials=10)\n</pre> atom.run(models=[\"LDA\", \"AdaB\"], metric=\"auc\", n_trials=10) <pre>\nTraining ========================= &gt;&gt;\nModels: LDA, AdaB\nMetric: auc\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |   eigen |       0.9 |  0.8807 |   0.8807 |     0.162s |  0.162s | COMPLETE |\n| 1     |     svd |       nan |  0.8445 |   0.8807 |     0.147s |  0.309s | COMPLETE |\n| 2     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.310s | COMPLETE |\n| 3     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.311s | COMPLETE |\n| 4     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.312s | COMPLETE |\n| 5     |   eigen |       0.9 |  0.8807 |   0.8807 |     0.000s |  0.312s | COMPLETE |\n| 6     |     svd |       nan |  0.8445 |   0.8807 |     0.000s |  0.312s | COMPLETE |\n| 7     |     svd |       nan |  0.8445 |   0.8807 |    
 0.001s |  0.313s | COMPLETE |\n| 8     |   eigen |       0.5 |  0.8417 |   0.8807 |     0.143s |  0.456s | COMPLETE |\n| 9     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.457s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 0\nBest parameters:\n --&gt; solver: eigen\n --&gt; shrinkage: 0.9\nBest evaluation --&gt; auc: 0.8807\nTime elapsed: 0.457s\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.8381\nTest evaluation --&gt; auc: 0.8037\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.482s\n\n\nRunning hyperparameter tuning for AdaBoost...\n| trial | n_estimators | learning_rate | algorithm |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------------ | ------------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |           90 |        0.4088 |   SAMME.R |  0.8002 |   0.8002 |     0.331s |  0.331s | COMPLETE |\n| 1     |          190 |        0.1019 |   SAMME.R |  0.8294 |   0.8294 |     0.540s |  0.871s | COMPLETE |\n| 2     |          260 |         0.243 |   SAMME.R |   0.754 |   0.8294 |     0.645s |  1.515s | COMPLETE |\n| 3     |          490 |         0.041 |   SAMME.R |  0.7953 |   0.8294 |     1.105s |  2.620s | COMPLETE |\n| 4     |          210 |        0.1604 |     SAMME |  0.7969 |   0.8294 |     0.527s |  3.148s | COMPLETE |\n| 5     |          310 |        0.1504 |     SAMME |  0.7988 |   0.8294 |     0.696s |  3.843s | COMPLETE |\n| 6     |          380 |         2.445 |     SAMME |  0.5978 |   0.8294 |     0.830s |  4.674s | COMPLETE |\n| 7     |          100 |        0.9151 |     SAMME |  0.8372 |   0.8372 |     0.328s |  5.002s | COMPLETE |\n| 8     |          350 |        8.9334 |     SAMME |  0.6751 |   0.8372 |     0.786s |  5.787s | COMPLETE |\n| 9     |          450 |        0.1974 |     SAMME |    0.82 |   0.8372 |     0.969s |  6.757s | COMPLETE |\nHyperparameter tuning 
---------------------------\nBest trial --&gt; 7\nBest parameters:\n --&gt; n_estimators: 100\n --&gt; learning_rate: 0.9151\n --&gt; algorithm: SAMME\nBest evaluation --&gt; auc: 0.8372\nTime elapsed: 6.757s\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9133\nTest evaluation --&gt; auc: 0.8353\nTime elapsed: 0.232s\n-------------------------------------------------\nTime: 6.989s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 9.134s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; auc: 0.8037\nAdaBoost                   --&gt; auc: 0.8353 !\n</pre> In\u00a0[5]: Copied! <pre>atom.evaluate()\n</pre> atom.evaluate() Out[5]: accuracy ap ba f1 jaccard mcc precision recall auc LDA 0.785 0.5888 0.7533 0.5825 0.4110 0.4542 0.5000 0.6977 0.8037 AdaB 0.820 0.5801 0.7165 0.5610 0.3898 0.4490 0.5897 0.5349 0.8353"}, {"location": "examples/getting_started/#example-getting-started", "title": "Example: Getting started\u00b6", "text": "<p>This example shows how to get started with the atom-ml library.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/holdout_set/", "title": "Holdout set", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... 
Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre># Initialize atom specifying a fraction of the dataset for holdout\natom = ATOMClassifier(X, n_rows=0.5, holdout_size=0.2, verbose=2)\n</pre> # Initialize atom specifying a fraction of the dataset for holdout atom = ATOMClassifier(X, n_rows=0.5, holdout_size=0.2, verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (56877, 22)\nTrain set size: 42658\nTest set size: 14219\nHoldout set size: 14219\n-------------------------------------\nMemory: 10.01 MB\nScaled: False\nMissing values: 126822 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 15 (0.0%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># The test and holdout fractions are split after subsampling the dataset\n# Also note that the holdout data set is not a part of atom's dataset\nprint(\"Length loaded data:\", len(X))\nprint(\"Length dataset + holdout:\", len(atom.dataset) + len(atom.holdout))\n</pre> # The test and holdout fractions are split after subsampling the dataset # Also note that the holdout data set is not a part of atom's dataset print(\"Length loaded data:\", len(X)) print(\"Length dataset + holdout:\", len(atom.dataset) + len(atom.holdout)) <pre>Length loaded data: 142193\nLength dataset + holdout: 71096\n</pre> In\u00a0[5]: Copied! <pre>atom.impute()\natom.encode()\n</pre> atom.impute() atom.encode() <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Dropping 258 samples due to missing values in feature MinTemp.\n --&gt; Dropping 127 samples due to missing values in feature MaxTemp.\n --&gt; Dropping 553 samples due to missing values in feature Rainfall.\n --&gt; Dropping 24308 samples due to missing values in feature Evaporation.\n --&gt; Dropping 27187 samples due to missing values in feature Sunshine.\n --&gt; Dropping 3739 samples due to missing values in feature WindGustDir.\n --&gt; Dropping 3712 samples due to missing values in feature WindGustSpeed.\n --&gt; Dropping 3995 samples due to missing values in feature WindDir9am.\n --&gt; Dropping 1508 samples due to missing values in feature WindDir3pm.\n --&gt; Dropping 539 samples due to missing values in feature WindSpeed9am.\n --&gt; Dropping 1077 samples due to missing values in feature WindSpeed3pm.\n --&gt; Dropping 706 samples due to missing values in feature Humidity9am.\n --&gt; Dropping 1447 samples due to missing values in feature Humidity3pm.\n --&gt; Dropping 5610 samples due to missing values in feature Pressure9am.\n --&gt; Dropping 5591 samples due to missing values in feature Pressure3pm.\n --&gt; Dropping 21520 samples due to missing values in feature Cloud9am.\n --&gt; Dropping 22921 samples due 
to missing values in feature Cloud3pm.\n --&gt; Dropping 365 samples due to missing values in feature Temp9am.\n --&gt; Dropping 1106 samples due to missing values in feature Temp3pm.\n --&gt; Dropping 553 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 26 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[6]: Copied! <pre># Unlike train and test, the holdout data set is not transformed until used for predictions\natom.holdout\n</pre> # Unlike train and test, the holdout data set is not transformed until used for predictions atom.holdout Out[6]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 22540 NorahHead 15.8 23.7 0.4 NaN NaN SSW 50.0 NW NaN ... 79.0 80.0 1012.4 1009.6 NaN NaN 18.4 18.9 No 0 22541 Brisbane 13.0 24.1 0.0 3.2 3.6 W 24.0 SW WSW ... 53.0 27.0 1019.9 1015.9 7.0 8.0 17.3 22.1 No 0 22542 MountGambier 14.7 36.2 0.0 7.2 12.5 S 33.0 N SSW ... 52.0 27.0 1018.8 1017.4 7.0 2.0 25.2 35.4 No 0 22543 Launceston 12.3 21.4 0.0 NaN NaN NNW 52.0 NNW NNW ... 62.0 60.0 NaN NaN 5.0 8.0 16.2 20.4 No 0 22544 MountGinini 3.2 10.0 0.0 NaN NaN WSW 52.0 WSW WSW ... 97.0 95.0 NaN NaN NaN NaN 6.5 8.4 No 0 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 36754 MountGinini 1.6 4.4 0.0 NaN NaN E 52.0 E E ... 100.0 100.0 NaN NaN NaN NaN 2.7 2.6 No 1 36755 WaggaWagga 9.9 21.8 0.0 4.6 5.7 WSW 35.0 S SW ... 57.0 36.0 1015.5 1013.7 7.0 7.0 17.0 21.3 No 0 36756 Walpole 8.8 16.3 0.8 NaN NaN NNW 37.0 NNE N ... 
84.0 79.0 1018.4 1013.5 NaN NaN 11.0 14.6 No 1 36757 Dartmoor 8.7 15.5 2.0 1.4 5.4 S 30.0 WSW SSW ... 100.0 94.0 1018.6 1020.0 NaN NaN 12.9 12.8 Yes 0 36758 SydneyAirport 16.8 22.6 8.4 5.0 3.8 S 57.0 WNW S ... 79.0 75.0 1013.2 1013.7 8.0 6.0 17.1 18.8 Yes 0 <p>14219 rows \u00d7 22 columns</p> In\u00a0[7]: Copied! <pre>atom.run(models=[\"GNB\", \"LR\", \"RF\"])\n</pre> atom.run(models=[\"GNB\", \"LR\", \"RF\"]) <pre>\nTraining ========================= &gt;&gt;\nModels: GNB, LR, RF\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.604\nTest evaluation --&gt; f1: 0.6063\nTime elapsed: 0.209s\n-------------------------------------------------\nTime: 0.209s\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6188\nTest evaluation --&gt; f1: 0.6162\nTime elapsed: 0.323s\n-------------------------------------------------\nTime: 0.323s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.6084\nTime elapsed: 4.533s\n-------------------------------------------------\nTime: 4.533s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.734s\n-------------------------------------\nGaussianNB         --&gt; f1: 0.6063\nLogisticRegression --&gt; f1: 0.6162 !\nRandomForest       --&gt; f1: 0.6084 ~\n</pre> In\u00a0[8]: Copied! <pre>atom.plot_prc()\n</pre> atom.plot_prc() In\u00a0[9]: Copied! 
<pre># Based on the results on the test set, we select the best model for further tuning\natom.run(\"lr_tuned\", n_trials=10)\n</pre> # Based on the results on the test set, we select the best model for further tuning atom.run(\"lr_tuned\", n_trials=10) <pre>\nTraining ========================= &gt;&gt;\nModels: LR_tuned\nMetric: f1\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio |      f1 | best_f1 | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |    None |  0.1893 |     sag |      540 |      0.4 |  0.6096 |  0.6096 |     0.797s |  0.797s | COMPLETE |\n| 1     |      l2 |  0.6275 | newto.. |      150 |      0.7 |  0.6101 |  0.6101 |     0.637s |  1.433s | COMPLETE |\n| 2     |      l1 |  0.7457 | libli.. |      740 |      0.7 |  0.6114 |  0.6114 |     0.815s |  2.248s | COMPLETE |\n| 3     |      l2 |  0.0759 | newto.. |      290 |      0.4 |  0.6204 |  0.6204 |     0.634s |  2.882s | COMPLETE |\n| 4     |      l2 |  0.2122 | newto.. 
|      730 |      0.9 |  0.6273 |  0.6273 |     0.635s |  3.516s | COMPLETE |\n| 5     |      l2 |  0.0017 |   lbfgs |      260 |      1.0 |   0.589 |  0.6273 |     0.581s |  4.097s | COMPLETE |\n| 6     |      l2 |  0.0137 |     sag |      130 |      0.4 |  0.6092 |  0.6273 |     0.615s |  4.711s | COMPLETE |\n| 7     |    None |  0.0014 |     sag |      640 |      0.1 |  0.5909 |  0.6273 |     0.725s |  5.436s | COMPLETE |\n| 8     |      l2 |  0.0224 |     sag |      500 |      1.0 |  0.6226 |  0.6273 |     0.653s |  6.089s | COMPLETE |\n| 9     |      l1 |  0.1594 |    saga |      630 |      0.2 |  0.6236 |  0.6273 |     0.810s |  6.898s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 4\nBest parameters:\n --&gt; penalty: l2\n --&gt; C: 0.2122\n --&gt; solver: newton-cg\n --&gt; max_iter: 730\n --&gt; l1_ratio: 0.9\nBest evaluation --&gt; f1: 0.6273\nTime elapsed: 6.898s\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6188\nTest evaluation --&gt; f1: 0.6172\nTime elapsed: 0.352s\n-------------------------------------------------\nTime: 7.251s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 7.461s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.6172\n</pre> <p>We already used the test set to choose the best model for futher tuning, so this set is no longer truly independent. Although it may not be directly visible in the results, using the test set now to evaluate the tuned LR model would be a mistake, since it carries a bias. For this reason, we have set apart an extra, indepedent set to validate the final model: the holdout set. If we are not going to use the test set for validation, we might as well use it to train the model and so optimize the use of the available data. Use the full_train method for this.</p> In\u00a0[10]: Copied! 
<pre># Re-train the model on the full dataset (train + test) \natom.lr_tuned.full_train()\n</pre> # Re-train the model on the full dataset (train + test)  atom.lr_tuned.full_train() <pre>Fit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6185\nTest evaluation --&gt; f1: 0.6185\nTime elapsed: 0.717s\n</pre> In\u00a0[11]: Copied! <pre># Evaluate on the holdout set\natom.lr_tuned.evaluate(rows=\"holdout\")\n</pre> # Evaluate on the holdout set atom.lr_tuned.evaluate(rows=\"holdout\") Out[11]: <pre>accuracy     0.8577\nap           0.7473\nba           0.7480\nf1           0.6352\njaccard      0.4654\nmcc          0.5606\nprecision    0.7559\nrecall       0.5477\nauc          0.8873\nName: LR_tuned, dtype: float64</pre> In\u00a0[13]: Copied! <pre>atom.lr_tuned.plot_prc(rows=\"holdout\", legend=\"upper right\")\n</pre> atom.lr_tuned.plot_prc(rows=\"holdout\", legend=\"upper right\")"}, {"location": "examples/holdout_set/#example-holdout-set", "title": "Example: Holdout set\u00b6", "text": "<p>This example shows when and how to use ATOM's holdout set in an exploration pipeline.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/holdout_set/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/holdout_set/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/holdout_set/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/", "title": "Hyperparameter tuning", "text": "In\u00a0[1]: Copied! 
<pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom optuna.distributions import IntDistribution\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from optuna.distributions import IntDistribution from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX, y = load_breast_cancer(return_X_y=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied! <pre># Initialize atom\natom = ATOMClassifier(X, y, n_jobs=4, verbose=2, random_state=1)\n</pre> # Initialize atom atom = ATOMClassifier(X, y, n_jobs=4, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nParallel processing with 4 cores.\nParallelization backend: loky\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># Train a MultiLayerPerceptron model on two metrics\n# using a custom number of hidden layers\natom.run(\n    models=\"MLP\",\n    metric=[\"f1\", \"ap\"],\n    n_trials=10,\n    est_params={\"activation\": \"relu\"},\n    ht_params={\n        \"distributions\": {\n            \"hidden_layer_1\": IntDistribution(2, 4),\n            \"hidden_layer_2\": IntDistribution(10, 20),\n            \"hidden_layer_3\": IntDistribution(10, 20),\n            \"hidden_layer_4\": IntDistribution(2, 4),\n        }\n    }\n)\n</pre> # Train a MultiLayerPerceptron model on two metrics # using a custom number of hidden layers atom.run(     models=\"MLP\",     metric=[\"f1\", \"ap\"],     n_trials=10,     est_params={\"activation\": \"relu\"},     ht_params={         \"distributions\": {             \"hidden_layer_1\": IntDistribution(2, 4),             \"hidden_layer_2\": IntDistribution(10, 20),             \"hidden_layer_3\": IntDistribution(10, 20),             \"hidden_layer_4\": IntDistribution(2, 4),         }     } ) <pre>\nTraining ========================= &gt;&gt;\nModels: MLP\nMetric: f1, ap\n\n\nRunning hyperparameter tuning for MultiLayerPerceptron...\n| trial | hidden_layer_1 | hidden_layer_2 | hidden_layer_3 | hidden_layer_4 |      f1 | best_f1 |      ap | best_ap | time_trial | time_ht |    state |\n| ----- | -------------- | -------------- | -------------- | -------------- | ------- | ------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |              3 |             17 |             10 |              2 |  0.9464 |  0.9464 |  0.9844 |  0.9844 |     9.139s |  9.139s | COMPLETE |\n| 1     |              2 |             11 |             12 |              3 |  0.9744 |  0.9744 |  0.9991 |  0.9991 |    11.466s | 20.605s | COMPLETE |\n| 2     |              3 |             15 |             14 |              4 |  0.9915 |  0.9915 |  0.9978 |  0.9991 |     8.570s | 29.175s | COMPLETE |\n| 3     |              2 |             19 |             10 |     
         4 |  0.9655 |  0.9915 |  0.9878 |  0.9991 |     9.208s | 38.383s | COMPLETE |\n| 4     |              3 |             16 |             11 |              2 |  0.9661 |  0.9915 |  0.9981 |  0.9991 |     0.657s | 39.039s | COMPLETE |\n| 5     |              4 |             20 |             13 |              4 |  0.9739 |  0.9915 |  0.9989 |  0.9991 |     0.623s | 39.662s | COMPLETE |\n| 6     |              4 |             19 |             10 |              2 |  0.9828 |  0.9915 |  0.9907 |  0.9991 |     0.601s | 40.263s | COMPLETE |\n| 7     |              2 |             19 |             11 |              3 |  0.7733 |  0.9915 |  0.9997 |  0.9997 |     0.601s | 40.863s | COMPLETE |\n| 8     |              4 |             15 |             17 |              2 |  0.9915 |  0.9915 |  0.9997 |  0.9997 |     0.601s | 41.464s | COMPLETE |\n| 9     |              4 |             19 |             10 |              4 |  0.9828 |  0.9915 |  0.9822 |  0.9997 |     0.599s | 42.062s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 8\nBest parameters:\n --&gt; hidden_layer_sizes: (4, 15, 17, 2)\nBest evaluation --&gt; f1: 0.9915   ap: 0.9997\nTime elapsed: 42.062s\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9965   ap: 0.9991\nTest evaluation --&gt; f1: 0.9718   ap: 0.9938\nTime elapsed: 1.515s\n-------------------------------------------------\nTime: 43.578s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 43.815s\n-------------------------------------\nMultiLayerPerceptron --&gt; f1: 0.9718   ap: 0.9938\n</pre> In\u00a0[5]: Copied! 
<pre># For multi-metric runs, the selected best trial is the first in the Pareto front\natom.mlp.best_trial\n</pre> # For multi-metric runs, the selected best trial is the first in the Pareto front atom.mlp.best_trial Out[5]: <pre>FrozenTrial(number=8, state=1, values=[0.9914529914529915, 0.9997077732320282], datetime_start=datetime.datetime(2023, 11, 4, 19, 13, 50, 113304), datetime_complete=datetime.datetime(2023, 11, 4, 19, 13, 50, 713850), params={'hidden_layer_1': 4, 'hidden_layer_2': 15, 'hidden_layer_3': 17, 'hidden_layer_4': 2}, user_attrs={'estimator': MLPClassifier(hidden_layer_sizes=(4, 15, 17, 2), random_state=1)}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'hidden_layer_1': IntDistribution(high=4, log=False, low=2, step=1), 'hidden_layer_2': IntDistribution(high=20, log=False, low=10, step=1), 'hidden_layer_3': IntDistribution(high=20, log=False, low=10, step=1), 'hidden_layer_4': IntDistribution(high=4, log=False, low=2, step=1)}, trial_id=8, value=None)</pre> In\u00a0[6]: Copied! <pre>atom.plot_pareto_front()\n</pre> atom.plot_pareto_front() In\u00a0[7]: Copied! 
<pre># If you are unhappy with the results, it's possible to conitnue the study\natom.mlp.hyperparameter_tuning(n_trials=5)\n</pre> # If you are unhappy with the results, it's possible to conitnue the study atom.mlp.hyperparameter_tuning(n_trials=5) <pre>Running hyperparameter tuning for MultiLayerPerceptron...\n| trial | hidden_layer_1 | hidden_layer_2 | hidden_layer_3 | hidden_layer_4 |      f1 | best_f1 |      ap | best_ap | time_trial | time_ht |    state |\n| ----- | -------------- | -------------- | -------------- | -------------- | ------- | ------- | ------- | ------- | ---------- | ------- | -------- |\n| 10    |              4 |             18 |             13 |              4 |  0.9831 |  0.9915 |  0.9997 |  0.9997 |     0.673s | 42.735s | COMPLETE |\n| 11    |              2 |             14 |             19 |              2 |  0.9421 |  0.9915 |  0.9899 |  0.9997 |     0.604s | 43.339s | COMPLETE |\n| 12    |              2 |             11 |             10 |              4 |  0.7733 |  0.9915 |    0.99 |  0.9997 |     0.617s | 43.955s | COMPLETE |\n| 13    |              2 |             12 |             15 |              2 |  0.9558 |  0.9915 |  0.9985 |  0.9997 |     0.595s | 44.550s | COMPLETE |\n| 14    |              3 |             11 |             16 |              4 |  0.7733 |  0.9915 |  0.9721 |  0.9997 |     0.663s | 45.212s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 8\nBest parameters:\n --&gt; hidden_layer_sizes: (4, 15, 17, 2)\nBest evaluation --&gt; f1: 0.9915   ap: 0.9997\nTime elapsed: 45.212s\n</pre> In\u00a0[8]: Copied! <pre># The trials attribute gives an overview of the trial results\natom.mlp.trials\n</pre> # The trials attribute gives an overview of the trial results atom.mlp.trials Out[8]: hidden_layer_1 hidden_layer_2 hidden_layer_3 hidden_layer_4 estimator f1 best_f1 ap best_ap time_trial time_ht state trial 0 3 17 10 2 MLPClassifier(hidden_layer_sizes=(3, 17, 10, 2... 
0.946429 0.991453 0.984402 0.999708 9.138911 9.138911 COMPLETE 1 2 11 12 3 MLPClassifier(hidden_layer_sizes=(2, 11, 12, 3... 0.974359 0.991453 0.999128 0.999708 11.466475 20.605386 COMPLETE 2 3 15 14 4 MLPClassifier(hidden_layer_sizes=(3, 15, 14, 4... 0.991453 0.991453 0.997842 0.999708 8.569545 29.174931 COMPLETE 3 2 19 10 4 MLPClassifier(hidden_layer_sizes=(2, 19, 10, 4... 0.965517 0.991453 0.987805 0.999708 9.207920 38.382851 COMPLETE 4 3 16 11 2 MLPClassifier(hidden_layer_sizes=(3, 16, 11, 2... 0.966102 0.991453 0.998086 0.999708 0.656597 39.039448 COMPLETE 5 4 20 13 4 MLPClassifier(hidden_layer_sizes=(4, 20, 13, 4... 0.973913 0.991453 0.998855 0.999708 0.622566 39.662014 COMPLETE 6 4 19 10 2 MLPClassifier(hidden_layer_sizes=(4, 19, 10, 2... 0.982759 0.991453 0.990748 0.999708 0.600547 40.262561 COMPLETE 7 2 19 11 3 MLPClassifier(hidden_layer_sizes=(2, 19, 11, 3... 0.773333 0.991453 0.999708 0.999708 0.600546 40.863107 COMPLETE 8 4 15 17 2 MLPClassifier(hidden_layer_sizes=(4, 15, 17, 2... 0.991453 0.991453 0.999708 0.999708 0.600546 41.463653 COMPLETE 9 4 19 10 4 MLPClassifier(hidden_layer_sizes=(4, 19, 10, 4... 0.982759 0.991453 0.982168 0.999708 0.598815 42.062468 COMPLETE 10 4 18 13 4 MLPClassifier(hidden_layer_sizes=(4, 18, 13, 4... 0.983051 0.991453 0.999708 0.999708 0.672611 42.735079 COMPLETE 11 2 14 19 2 MLPClassifier(hidden_layer_sizes=(2, 14, 19, 2... 0.942149 0.991453 0.989914 0.999708 0.603549 43.338628 COMPLETE 12 2 11 10 4 MLPClassifier(hidden_layer_sizes=(2, 11, 10, 4... 0.773333 0.991453 0.990024 0.999708 0.616561 43.955189 COMPLETE 13 2 12 15 2 MLPClassifier(hidden_layer_sizes=(2, 12, 15, 2... 0.955752 0.991453 0.998518 0.999708 0.594541 44.549730 COMPLETE 14 3 11 16 4 MLPClassifier(hidden_layer_sizes=(3, 11, 16, 4... 0.773333 0.991453 0.972070 0.999708 0.662602 45.212332 COMPLETE In\u00a0[9]: Copied! 
<pre># Select a custom best trial...\natom.mlp.best_trial = 2\n\n# ...and check that the best parameters are now those in the selected trial\natom.mlp.best_params\n</pre> # Select a custom best trial... atom.mlp.best_trial = 2  # ...and check that the best parameters are now those in the selected trial atom.mlp.best_params Out[9]: <pre>{'hidden_layer_sizes': (3, 15, 14, 4)}</pre> In\u00a0[10]: Copied! <pre># Lastly, fit the model on the complete training set \n# using the new combination of hyperparameters\natom.mlp.fit()\n</pre> # Lastly, fit the model on the complete training set  # using the new combination of hyperparameters atom.mlp.fit() <pre>Fit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9983   ap: 0.9998\nTest evaluation --&gt; f1: 0.9718   ap: 0.9947\nTime elapsed: 3.048s\n</pre> In\u00a0[11]: Copied! <pre>atom.plot_trials()\n</pre> atom.plot_trials() In\u00a0[12]: Copied! <pre>atom.plot_parallel_coordinate()\n</pre> atom.plot_parallel_coordinate()"}, {"location": "examples/hyperparameter_tuning/#example-hyperparameter-tuning", "title": "Example: Hyperparameter tuning\u00b6", "text": "<p>This example shows an advanced example on how to optimize your model's hyperparameters for multi-metric runs.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/hyperparameter_tuning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/", "title": "Imbalanced datasets", "text": "In\u00a0[1]: Copied! 
<pre># Import packages\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n</pre> # Import packages from atom import ATOMClassifier from sklearn.datasets import make_classification In\u00a0[2]: Copied! <pre># Create a mock imbalanced dataset\nX, y = make_classification(\n    n_samples=5000,\n    n_features=30,\n    n_informative=20,\n    weights=(0.95,),\n    random_state=1,\n)\n</pre> # Create a mock imbalanced dataset X, y = make_classification(     n_samples=5000,     n_features=30,     n_informative=20,     weights=(0.95,),     random_state=1, ) In\u00a0[3]: Copied! <pre># Initialize atom\natom = ATOMClassifier(X, y, test_size=0.2, verbose=2, random_state=1)\n</pre> # Initialize atom atom = ATOMClassifier(X, y, test_size=0.2, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (5000, 31)\nTrain set size: 4000\nTest set size: 1000\n-------------------------------------\nMemory: 1.24 MB\nScaled: False\nOutlier values: 570 (0.5%)\n\n</pre> In\u00a0[4]: Copied! <pre># Let's have a look at the data. Note that, since the input wasn't\n# a dataframe, atom has given default names to the columns.\natom.head()\n</pre> # Let's have a look at the data. Note that, since the input wasn't # a dataframe, atom has given default names to the columns. atom.head() Out[4]: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x21 x22 x23 x24 x25 x26 x27 x28 x29 target 0 -0.535760 -2.426045 1.256836 0.374501 -3.241958 -1.239468 -0.208750 -6.015995 3.698669 0.112512 ... 0.044302 -1.935727 10.870353 0.286755 -2.416507 0.556990 -1.522635 3.719201 1.449135 0 1 -3.311935 -3.149920 -0.801252 -2.644414 -0.704889 -3.312256 0.714515 2.992345 5.056910 3.036775 ... 
2.224359 0.451273 -1.822108 -1.435801 0.036132 -1.364583 1.215663 5.232161 1.408798 0 2 3.821199 1.328129 -1.000720 -13.151697 0.254253 1.263636 -1.088451 4.924264 -1.225646 -6.974824 ... 3.541222 1.686667 -13.763703 -1.321256 1.677687 0.774966 -5.067689 4.663386 -1.714186 0 3 5.931126 3.338830 0.545906 2.296355 -3.941088 3.527252 -0.158770 3.138381 -0.927460 -1.642079 ... -3.634442 7.853176 -8.457598 0.000490 -2.612756 -1.138206 0.497150 4.351289 -0.321748 0 4 -2.829472 -1.227185 -0.751892 3.056106 -1.988920 -2.219184 -0.075882 5.790102 -2.786671 2.023458 ... 4.057954 1.178564 -15.028187 1.627140 -1.093587 -0.422655 1.777011 6.660638 -2.553723 0 <p>5 rows \u00d7 31 columns</p> In\u00a0[6]: Copied! <pre># Let's start reducing the number of features\natom.feature_selection(\"rfe\", solver=\"rf\", n_features=12)\n</pre> # Let's start reducing the number of features atom.feature_selection(\"rfe\", solver=\"rf\", n_features=12) <pre>Fitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfe selected 12 features from the dataset.\n   --&gt; Dropping feature x1 (rank 8).\n   --&gt; Dropping feature x2 (rank 11).\n   --&gt; Dropping feature x4 (rank 3).\n   --&gt; Dropping feature x6 (rank 16).\n   --&gt; Dropping feature x7 (rank 14).\n   --&gt; Dropping feature x10 (rank 19).\n   --&gt; Dropping feature x12 (rank 13).\n   --&gt; Dropping feature x13 (rank 12).\n   --&gt; Dropping feature x14 (rank 9).\n   --&gt; Dropping feature x16 (rank 10).\n   --&gt; Dropping feature x18 (rank 17).\n   --&gt; Dropping feature x19 (rank 2).\n   --&gt; Dropping feature x20 (rank 4).\n   --&gt; Dropping feature x22 (rank 7).\n   --&gt; Dropping feature x23 (rank 5).\n   --&gt; Dropping feature x24 (rank 18).\n   --&gt; Dropping feature x25 (rank 6).\n   --&gt; Dropping feature x26 (rank 15).\n</pre> In\u00a0[7]: Copied! 
<pre># Fit a model directly on the imbalanced data\natom.run(\"RF\", metric=\"ba\")\n</pre> # Fit a model directly on the imbalanced data atom.run(\"RF\", metric=\"ba\") <pre>\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; ba: 1.0\nTest evaluation --&gt; ba: 0.5556\nTime elapsed: 2.497s\n-------------------------------------------------\nTime: 2.497s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.568s\n-------------------------------------\nRandomForest --&gt; ba: 0.5556 ~\n</pre> In\u00a0[8]: Copied! <pre># The transformer and the models have been added to the branch\natom.branch\n</pre> # The transformer and the models have been added to the branch atom.branch Out[8]: <pre>Branch(main)</pre> In\u00a0[9]: Copied! <pre># Create a new branch for oversampling\natom.branch = \"oversample\"\n</pre> # Create a new branch for oversampling atom.branch = \"oversample\" <pre>Successfully created new branch: oversample.\n</pre> In\u00a0[10]: Copied! <pre># Perform oversampling of the minority class\natom.balance(strategy=\"smote\")\n</pre> # Perform oversampling of the minority class atom.balance(strategy=\"smote\") <pre>Oversampling with SMOTE...\n --&gt; Adding 3570 samples to class 1.\n</pre> In\u00a0[11]: Copied! <pre>atom.classes  # Check the balanced training set!\n</pre> atom.classes  # Check the balanced training set! Out[11]: dataset train test 0 4731 3785 946 1 3839 3785 54 In\u00a0[12]: Copied! <pre># Train another model on the new branch. Add a tag after \n# the model's acronym to distinguish it from the first model\natom.run(\"rf_os\")  # os for oversample\n</pre> # Train another model on the new branch. 
Add a tag after  # the model's acronym to distinguish it from the first model atom.run(\"rf_os\")  # os for oversample <pre>\nTraining ========================= &gt;&gt;\nModels: RF_os\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; ba: 1.0\nTest evaluation --&gt; ba: 0.7672\nTime elapsed: 4.136s\n-------------------------------------------------\nTime: 4.136s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 4.248s\n-------------------------------------\nRandomForest --&gt; ba: 0.7672 ~\n</pre> In\u00a0[14]: Copied! <pre># Create the undersampling branch\n# Split from master to not adopt the oversmapling transformer\natom.branch = \"undersample_from_main\"\n</pre> # Create the undersampling branch # Split from master to not adopt the oversmapling transformer atom.branch = \"undersample_from_main\" <pre>Successfully created new branch: undersample.\n</pre> In\u00a0[15]: Copied! <pre>atom.classes  # In this branch, the data is still imbalanced\n</pre> atom.classes  # In this branch, the data is still imbalanced Out[15]: dataset train test 0 4731 3785 946 1 269 215 54 In\u00a0[16]: Copied! <pre># Perform undersampling of the majority class\natom.balance(strategy=\"NearMiss\")\n</pre> # Perform undersampling of the majority class atom.balance(strategy=\"NearMiss\") <pre>Undersampling with NearMiss...\n --&gt; Removing 3570 samples from class 0.\n</pre> In\u00a0[17]: Copied! 
<pre>atom.run(\"rf_us\")\n</pre> atom.run(\"rf_us\") <pre>\nTraining ========================= &gt;&gt;\nModels: RF_us\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; ba: 1.0\nTest evaluation --&gt; ba: 0.6706\nTime elapsed: 0.285s\n-------------------------------------------------\nTime: 0.285s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.321s\n-------------------------------------\nRandomForest --&gt; ba: 0.6706 ~\n</pre> In\u00a0[18]: Copied! <pre># Check that the branch only contains the desired transformers \natom.branch\n</pre> # Check that the branch only contains the desired transformers  atom.branch Out[18]: <pre>Branch(undersample)</pre> In\u00a0[19]: Copied! <pre># Visualize the complete pipeline\natom.plot_pipeline()\n</pre> # Visualize the complete pipeline atom.plot_pipeline() In\u00a0[20]: Copied! <pre>atom.evaluate()\n</pre> atom.evaluate() Out[20]: accuracy ap ba f1 jaccard mcc precision recall auc RF 0.952 0.6562 0.5556 0.2000 0.1111 0.3252 1.000 0.1111 0.9107 RF_os 0.956 0.6215 0.7672 0.5769 0.4054 0.5542 0.600 0.5556 0.9251 RF_us 0.509 0.3687 0.6706 0.1578 0.0857 0.1545 0.087 0.8519 0.8258 In\u00a0[21]: Copied! <pre>atom.plot_prc()\n</pre> atom.plot_prc() In\u00a0[22]: Copied! <pre>atom.plot_roc()\n</pre> atom.plot_roc()"}, {"location": "examples/imbalanced_datasets/#example-imbalanced-datasets", "title": "Example: Imbalanced datasets\u00b6", "text": "<p>This example shows how ATOM can help you handle imbalanced datasets. 
We will evaluate the performance of three different Random Forest models: one trained directly on the imbalanced dataset, one trained on an oversampled dataset and the last one trained on an undersampled dataset.</p>"}, {"location": "examples/imbalanced_datasets/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#oversampling", "title": "Oversampling\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#undersampling", "title": "Undersampling\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/in_training_validation/", "title": "In-training validation", "text": "In\u00a0[1]: Copied! <pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX, y = load_breast_cancer(return_X_y=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied! <pre># Initialize atom\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\n</pre> # Initialize atom atom = ATOMClassifier(X, y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># Not all models support in-training validation\n# You can chek which ones do using the available_models method\ndf = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]]\ndf[df[\"has_validation\"]]\n</pre> # Not all models support in-training validation # You can chek which ones do using the available_models method df = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]] df[df[\"has_validation\"]] Out[4]: acronym model has_validation 3 CatB CatBoost True 15 LGB LightGBM True 19 MLP MultiLayerPerceptron True 21 PA PassiveAggressive True 22 Perc Perceptron True 27 SGD StochasticGradientDescent True 29 XGB XGBoost True In\u00a0[5]: Copied! <pre># Run the models normally\natom.run(models=[\"MLP\", \"LGB\"], metric=\"auc\")\n</pre> # Run the models normally atom.run(models=[\"MLP\", \"LGB\"], metric=\"auc\") <pre>\nTraining ========================= &gt;&gt;\nModels: MLP, LGB\nMetric: auc\n\n\nResults for MultiLayerPerceptron:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9997\nTest evaluation --&gt; auc: 0.9936\nTime elapsed: 1.821s\n-------------------------------------------------\nTime: 1.821s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 1.0\nTest evaluation --&gt; auc: 0.9775\nTime elapsed: 0.352s\n-------------------------------------------------\nTime: 0.352s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.236s\n-------------------------------------\nMultiLayerPerceptron --&gt; auc: 0.9936 !\nLightGBM             --&gt; auc: 0.9775\n</pre> In\u00a0[6]: Copied! <pre>atom.plot_evals(title=\"In-training validation scores\")\n</pre> atom.plot_evals(title=\"In-training validation scores\") In\u00a0[7]: Copied! 
<pre># Plot the validation on the train and test set\natom.lgb.plot_evals(dataset=\"train+test\", title=\"LightGBM's in-training validation\")\n</pre> # Plot the validation on the train and test set atom.lgb.plot_evals(dataset=\"train+test\", title=\"LightGBM's in-training validation\")"}, {"location": "examples/in_training_validation/#example-in-training-validation", "title": "Example: In-training validation\u00b6", "text": "<p>This example shows how to keep track of the model's performance during training.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/in_training_validation/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/in_training_validation/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/in_training_validation/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/memory_considerations/", "title": "Memory considerations", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport os\nimport tempfile\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import os import tempfile import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 
59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre># Define a temp directory to store the files in this example\ntempdir = tempfile.gettempdir()\n</pre> # Define a temp directory to store the files in this example tempdir = tempfile.gettempdir() In\u00a0[4]: Copied! <pre>def get_size(filepath):\n    \"\"\"Return the size of the object in MB.\"\"\"\n    return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\"\n</pre> def get_size(filepath):     \"\"\"Return the size of the object in MB.\"\"\"     return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\" In\u00a0[5]: Copied! <pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\n</pre> <p>Note that the datset takes ~25MB. We can reduce the size of the dataset using the shrink method, which reduces the dtypes to their smallest possible value.</p> In\u00a0[6]: Copied! 
<pre>atom.dtypes\n</pre> atom.dtypes Out[6]: <pre>Location          object\nMinTemp          float64\nMaxTemp          float64\nRainfall         float64\nEvaporation      float64\nSunshine         float64\nWindGustDir       object\nWindGustSpeed    float64\nWindDir9am        object\nWindDir3pm        object\nWindSpeed9am     float64\nWindSpeed3pm     float64\nHumidity9am      float64\nHumidity3pm      float64\nPressure9am      float64\nPressure3pm      float64\nCloud9am         float64\nCloud3pm         float64\nTemp9am          float64\nTemp3pm          float64\nRainToday         object\nRainTomorrow       int64\ndtype: object</pre> In\u00a0[7]: Copied! <pre>atom.shrink(str2cat=True)\n</pre> atom.shrink(str2cat=True) <pre>The column dtypes are successfully converted.\n</pre> In\u00a0[8]: Copied! <pre>atom.dtypes\n</pre> atom.dtypes Out[8]: <pre>Location         category\nMinTemp           Float32\nMaxTemp           Float32\nRainfall          Float32\nEvaporation       Float32\nSunshine          Float32\nWindGustDir      category\nWindGustSpeed       Int16\nWindDir9am       category\nWindDir3pm       category\nWindSpeed9am        Int16\nWindSpeed3pm         Int8\nHumidity9am          Int8\nHumidity3pm          Int8\nPressure9am       Float32\nPressure3pm       Float32\nCloud9am             Int8\nCloud3pm             Int8\nTemp9am           Float32\nTemp3pm           Float32\nRainToday        category\nRainTomorrow         Int8\ndtype: object</pre> In\u00a0[9]: Copied! <pre># Let's check the memory usage again...\n# Notice the huge drop!\natom.stats()\n</pre> # Let's check the memory usage again... # Notice the huge drop! atom.stats() <pre>Dataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 9.67 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n</pre> In\u00a0[10]: Copied! 
<pre># Now, we create some new branches to train models with different trasnformers\natom.impute()\natom.encode()\natom.run(\"LDA\")\n\natom.branch = \"b2\"\natom.scale()\natom.run(\"LDA_scaled\")\n\natom.branch = \"b3_from_main\"\natom.normalize()\natom.run(\"LDA_norm\")\n</pre> # Now, we create some new branches to train models with different trasnformers atom.impute() atom.encode() atom.run(\"LDA\")  atom.branch = \"b2\" atom.scale() atom.run(\"LDA_scaled\")  atom.branch = \"b3_from_main\" atom.normalize() atom.run(\"LDA_norm\") <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Dropping 637 samples due to missing values in feature MinTemp.\n --&gt; Dropping 322 samples due to missing values in feature MaxTemp.\n --&gt; Dropping 1406 samples due to missing values in feature Rainfall.\n --&gt; Dropping 60843 samples due to missing values in feature Evaporation.\n --&gt; Dropping 67816 samples due to missing values in feature Sunshine.\n --&gt; Dropping 9330 samples due to missing values in feature WindGustDir.\n --&gt; Dropping 9270 samples due to missing values in feature WindGustSpeed.\n --&gt; Dropping 10013 samples due to missing values in feature WindDir9am.\n --&gt; Dropping 3778 samples due to missing values in feature WindDir3pm.\n --&gt; Dropping 1348 samples due to missing values in feature WindSpeed9am.\n --&gt; Dropping 2630 samples due to missing values in feature WindSpeed3pm.\n --&gt; Dropping 1774 samples due to missing values in feature Humidity9am.\n --&gt; Dropping 3610 samples due to missing values in feature Humidity3pm.\n --&gt; Dropping 14014 samples due to missing values in feature Pressure9am.\n --&gt; Dropping 13981 samples due to missing values in feature Pressure3pm.\n --&gt; Dropping 53657 samples due to missing values in feature Cloud9am.\n --&gt; Dropping 57094 samples due to missing values in feature Cloud3pm.\n --&gt; Dropping 904 samples due to missing values in feature Temp9am.\n --&gt; Dropping 2726 samples due to 
missing values in feature Temp3pm.\n --&gt; Dropping 1406 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 26 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n\nTraining ========================= &gt;&gt;\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6213\nTest evaluation --&gt; f1: 0.6341\nTime elapsed: 0.375s\n-------------------------------------------------\nTime: 0.375s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.613s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6341\nSuccessfully created new branch: b2.\nFitting Scaler...\nScaling features...\n\nTraining ========================= &gt;&gt;\nModels: LDA_scaled\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6213\nTest evaluation --&gt; f1: 0.6341\nTime elapsed: 0.390s\n-------------------------------------------------\nTime: 0.390s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.626s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6341\nSuccessfully created new branch: b3.\nFitting Normalizer...\nNormalizing features...\n\nTraining ========================= &gt;&gt;\nModels: LDA_norm\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6267\nTest evaluation --&gt; f1: 0.6368\nTime elapsed: 0.369s\n-------------------------------------------------\nTime: 0.369s\n\n\nFinal results ==================== 
&gt;&gt;\nTotal time: 0.626s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6368\n</pre> In\u00a0[11]: Copied! <pre># If we save atom now, notice the size\n# This is because atom keeps a copy of every branch in memory\nfilename = tempdir + \"atom1\"\natom.save(filename)\nget_size(filename)\n</pre> # If we save atom now, notice the size # This is because atom keeps a copy of every branch in memory filename = tempdir + \"atom1\" atom.save(filename) get_size(filename) <pre>ATOMClassifier successfully saved.\n</pre> Out[11]: <pre>'34.92MB'</pre> <p>To avoid large memory usages, set the <code>memory</code> parameter.</p> In\u00a0[12]: Copied! <pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\natom.shrink(str2cat=True)\natom.impute()\natom.encode()\natom.run(\"LDA\")\n\natom.branch = \"b2\"\natom.scale()\natom.run(\"LDA_scaled\")\n\natom.branch = \"b3_from_main\"\natom.normalize()\natom.run(\"LDA_norm\")\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1) atom.shrink(str2cat=True) atom.impute() atom.encode() atom.run(\"LDA\")  atom.branch = \"b2\" atom.scale() atom.run(\"LDA_scaled\")  atom.branch = \"b3_from_main\" atom.normalize() atom.run(\"LDA_norm\") <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nCache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nThe column dtypes are successfully converted.\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n\nTraining ========================= &gt;&gt;\nModels: LDA\nMetric: 
f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6233\nTest evaluation --&gt; f1: 0.6248\nTime elapsed: 0.445s\n-------------------------------------------------\nTime: 0.445s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.708s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6248\nSuccessfully created new branch: b2.\nFitting Scaler...\nScaling features...\n\nTraining ========================= &gt;&gt;\nModels: LDA_scaled\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6233\nTest evaluation --&gt; f1: 0.6248\nTime elapsed: 0.454s\n-------------------------------------------------\nTime: 0.454s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.737s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6248\nSuccessfully created new branch: b3.\nFitting Normalizer...\nNormalizing features...\n\nTraining ========================= &gt;&gt;\nModels: LDA_norm\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6279\nTest evaluation --&gt; f1: 0.6298\nTime elapsed: 0.447s\n-------------------------------------------------\nTime: 0.447s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.740s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6298\n</pre> In\u00a0[13]: Copied! 
<pre># And now, it only takes a fraction of the previous size\n# This is because the data of inactive branches is now stored locally\nfilename = tempdir + \"atom2\"\natom.save(filename)\nget_size(filename)\n</pre> # And now, it only takes a fraction of the previous size # This is because the data of inactive branches is now stored locally filename = tempdir + \"atom2\" atom.save(filename) get_size(filename) <pre>ATOMClassifier successfully saved.\n</pre> Out[13]: <pre>'9.63MB'</pre> <p>Additionnaly, repeated calls to the same transformers with the same data will use the cached results. Don't forget to specify the <code>random_state</code> parameter to ensure the data remains the exact same.</p> In\u00a0[14]: Copied! <pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\natom.shrink(str2cat=True)\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1) atom.shrink(str2cat=True) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nCache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nThe column dtypes are successfully converted.\n</pre> In\u00a0[15]: Copied! <pre># Note the transformers are no longer fitted,\n# instead the results are immediately read from cache\natom.impute()\natom.encode()\n</pre> # Note the transformers are no longer fitted, # instead the results are immediately read from cache atom.impute() atom.encode() <pre>Retrieving cached results for Imputer...\nRetrieving cached results for Encoder...\nEncoding categorical columns...\n</pre> In\u00a0[16]: Copied! 
<pre>atom.dataset\n</pre> atom.dataset Out[16]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 0.075703 13.0 30.5 0.0 6.8 10.0 0.271668 59 0.312069 0.273733 ... 19 8 1013.599976 1008.0 0 2 19.6 29.9 0.0 0 1 0.245394 15.3 22.4 16.0 4.2 3.3 0.204934 39 0.236475 0.199626 ... 83 63 1025.5 1023.599976 6 6 16.9 21.1 1.0 1 2 0.262397 27.9 34.5 0.0 9.0 7.9 0.1737 72 0.236475 0.306935 ... 72 63 1009.0 1005.5 7 7 31.0 33.099998 0.0 1 3 0.239174 12.9 27.9 0.0 5.4 8.6 0.269421 39 0.256213 0.286159 ... 69 56 1023.400024 1019.799988 7 7 14.7 23.4 0.0 0 4 0.253089 7.4 14.3 0.8 2.8 4.0 0.210095 31 0.269333 0.167808 ... 84 62 1023.599976 1023.200012 4 7 9.0 13.6 0.0 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 56415 0.295559 23.9 28.1 0.0 2.6 7.7 0.241448 44 0.279553 0.259391 ... 86 79 1015.900024 1013.900024 7 7 25.799999 27.5 0.0 0 56416 0.217037 13.6 24.6 0.0 4.4 7.8 0.1737 39 0.193908 0.197102 ... 87 61 1023.200012 1022.599976 7 3 17.299999 21.4 0.0 0 56417 0.112176 16.299999 38.700001 0.0 10.2 13.4 0.1737 24 0.149795 0.168702 ... 29 8 1013.5 1010.299988 5 2 26.4 36.900002 0.0 0 56418 0.295559 11.5 19.200001 0.8 2.0 7.0 0.147458 22 0.13795 0.195807 ... 73 52 1021.299988 1018.799988 3 4 17.1 18.4 0.0 0 56419 0.403054 5.9 18.0 0.4 0.8 6.7 0.269421 26 0.312069 0.286159 ... 92 65 1028.0 1025.300049 3 2 9.4 16.6 0.0 0 <p>56420 rows \u00d7 22 columns</p>"}, {"location": "examples/memory_considerations/#example-memory-considerations", "title": "Example: Memory considerations\u00b6", "text": "<p>This example shows how to use the <code>memory</code> parameter to make efficient use of the available memory.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/memory_considerations/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/memory_considerations/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multi_metric/", "title": "Multi-metric runs", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMRegressor\n</pre> # Import packages import pandas as pd from atom import ATOMRegressor In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")  # Let's have a look X.head() Out[2]: Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 In\u00a0[3]: Copied! <pre>atom = ATOMRegressor(X, n_jobs=1, verbose=2, random_state=1)\n</pre> atom = ATOMRegressor(X, n_jobs=1, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (4177, 9)\nTrain set size: 3342\nTest set size: 835\n-------------------------------------\nMemory: 300.88 kB\nScaled: False\nCategorical features: 1 (12.5%)\nOutlier values: 189 (0.6%)\n\n</pre> In\u00a0[4]: Copied! <pre>atom.encode()\n</pre> atom.encode() <pre>Fitting Encoder...\nEncoding categorical columns...\n --&gt; OneHot-encoding feature Sex. Contains 3 classes.\n</pre> In\u00a0[5]: Copied! 
<pre># For every step of the BO, both metrics are calculated,\n# but only the first is used for optimization!\natom.run(\n    models=[\"lsvm\", \"hGBM\"],\n    metric=(\"r2\", \"rmse\"),\n    n_trials=10,\n    n_bootstrap=6,\n)\n</pre> # For every step of the BO, both metrics are calculated, # but only the first is used for optimization! atom.run(     models=[\"lsvm\", \"hGBM\"],     metric=(\"r2\", \"rmse\"),     n_trials=10,     n_bootstrap=6, ) <pre>\nTraining ========================= &gt;&gt;\nModels: lSVM, hGBM\nMetric: r2, rmse\n\n\nRunning hyperparameter tuning for LinearSVM...\n| trial |                    loss |       C |    dual |      r2 | best_r2 |    rmse | best_rmse | time_trial | time_ht |    state |\n| ----- | ----------------------- | ------- | ------- | ------- | ------- | ------- | --------- | ---------- | ------- | -------- |\n| 0     | squared_epsilon_insen.. |   0.001 |    True |  0.2887 |  0.2887 | -2.6528 |   -2.6528 |     0.043s |  0.043s | COMPLETE |\n| 1     | squared_epsilon_insen.. |  0.0534 |   False |  0.3862 |  0.3862 | -2.5926 |   -2.5926 |     0.043s |  0.086s | COMPLETE |\n| 2     | squared_epsilon_insen.. |  0.0105 |    True |   0.433 |   0.433 | -2.4084 |   -2.4084 |     0.054s |  0.140s | COMPLETE |\n| 3     |     epsilon_insensitive |  0.6215 |    True |  0.4022 |   0.433 | -2.5251 |   -2.4084 |     0.045s |  0.185s | COMPLETE |\n| 4     | squared_epsilon_insen.. |  0.0369 |   False |  0.4057 |   0.433 | -2.5477 |   -2.4084 |     0.040s |  0.225s | COMPLETE |\n| 5     |     epsilon_insensitive |  0.0016 |    True | -1.5344 |   0.433 | -5.0102 |   -2.4084 |     0.035s |  0.260s | COMPLETE |\n| 6     | squared_epsilon_insen.. | 61.5811 |   False |  0.4354 |  0.4354 | -2.3845 |   -2.3845 |     0.034s |  0.294s | COMPLETE |\n| 7     | squared_epsilon_insen.. 
|  14.898 |   False |  0.4925 |  0.4925 | -2.2628 |   -2.2628 |     0.035s |  0.329s | COMPLETE |\n| 8     |     epsilon_insensitive |  0.0252 |    True |  0.3695 |  0.4925 | -2.6178 |   -2.2628 |     0.035s |  0.364s | COMPLETE |\n| 9     | squared_epsilon_insen.. |  0.0294 |    True |  0.4767 |  0.4925 | -2.3896 |   -2.2628 |     0.044s |  0.408s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 7\nBest parameters:\n --&gt; loss: squared_epsilon_insensitive\n --&gt; C: 14.898\n --&gt; dual: False\nBest evaluation --&gt; r2: 0.4925   rmse: -2.2628\nTime elapsed: 0.408s\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.4592   rmse: -2.3795\nTest evaluation --&gt; r2: 0.4584   rmse: -2.3369\nTime elapsed: 0.089s\nBootstrap ---------------------------------------\nEvaluation --&gt; r2: 0.4577 \u00b1 0.002   rmse: -2.3384 \u00b1 0.0043\nTime elapsed: 0.094s\n-------------------------------------------------\nTime: 0.592s\n\n\nRunning hyperparameter tuning for HistGradientBoosting...\n| trial |      loss | quantile | learning_rate | max_iter | max_leaf_nodes | max_depth | min_samples_leaf | l2_regularization |      r2 | best_r2 |    rmse | best_rmse | time_trial | time_ht |    state |\n| ----- | --------- | -------- | ------------- | -------- | -------------- | --------- | ---------------- | ----------------- | ------- | ------- | ------- | --------- | ---------- | ------- | -------- |\n| 0     | absolut.. 
|      0.1 |        0.0236 |      180 |             26 |        12 |               11 |               0.0 |  0.5373 |  0.5373 | -2.1398 |   -2.1398 |     0.968s |  0.968s | COMPLETE |\n| 1     |     gamma |      0.5 |         0.242 |      160 |             38 |         3 |               20 |               0.0 |   0.574 |   0.574 | -2.1598 |   -2.1398 |     0.160s |  1.128s | COMPLETE |\n| 2     |  quantile |      0.4 |        0.2448 |      210 |             12 |         3 |               25 |               0.3 |  0.4714 |   0.574 | -2.3253 |   -2.1398 |     0.422s |  1.550s | COMPLETE |\n| 3     |  quantile |      0.6 |         0.017 |      480 |             28 |        16 |               13 |               0.1 |  0.5712 |   0.574 | -2.1385 |   -2.1385 |     3.405s |  4.956s | COMPLETE |\n| 4     | squared.. |      1.0 |        0.2649 |       70 |             10 |        10 |               28 |               0.8 |  0.5561 |   0.574 | -2.2019 |   -2.1385 |     0.148s |  5.104s | COMPLETE |\n| 5     | squared.. |      0.1 |        0.0283 |      360 |             32 |         9 |               11 |               0.5 |  0.5464 |   0.574 | -2.1197 |   -2.1197 |     1.248s |  6.352s | COMPLETE |\n| 6     |  quantile |      0.4 |        0.1264 |      380 |             37 |        12 |               29 |               1.0 |  0.4416 |   0.574 | -2.3713 |   -2.1197 |     3.002s |  9.354s | COMPLETE |\n| 7     |     gamma |      0.6 |         0.678 |      330 |             25 |         6 |               12 |               0.8 |  0.4299 |   0.574 | -2.3984 |   -2.1197 |     0.739s | 10.092s | COMPLETE |\n| 8     | absolut.. |      0.9 |        0.0831 |      280 |             42 |        16 |               10 |               1.0 |  0.5242 |   0.574 | -2.2742 |   -2.1197 |     2.002s | 12.094s | COMPLETE |\n| 9     | absolut.. 
|      0.6 |        0.0373 |      300 |             40 |        13 |               17 |               0.8 |  0.5685 |   0.574 |   -2.17 |   -2.1197 |     1.859s | 13.953s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 5\nBest parameters:\n --&gt; loss: squared_error\n --&gt; quantile: 0.1\n --&gt; learning_rate: 0.0283\n --&gt; max_iter: 360\n --&gt; max_leaf_nodes: 32\n --&gt; max_depth: 9\n --&gt; min_samples_leaf: 11\n --&gt; l2_regularization: 0.5\nBest evaluation --&gt; r2: 0.5464   rmse: -2.1197\nTime elapsed: 13.953s\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.7959   rmse: -1.4619\nTest evaluation --&gt; r2: 0.5479   rmse: -2.1351\nTime elapsed: 1.470s\nBootstrap ---------------------------------------\nEvaluation --&gt; r2: 0.5259 \u00b1 0.0154   rmse: -2.1861 \u00b1 0.0352\nTime elapsed: 7.930s\n-------------------------------------------------\nTime: 23.353s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 25.299s\n-------------------------------------\nLinearSVM            --&gt; r2: 0.4577 \u00b1 0.002   rmse: -2.3384 \u00b1 0.0043\nHistGradientBoosting --&gt; r2: 0.5259 \u00b1 0.0154   rmse: -2.1861 \u00b1 0.0352 ~ !\n</pre> In\u00a0[6]: Copied! <pre># Check the robustness of the pipeline using cross-validation\natom.winner.cross_validate()\n</pre> # Check the robustness of the pipeline using cross-validation atom.winner.cross_validate() <pre>Applying cross-validation...\n</pre> Out[6]: train_r2 test_r2 train_rmse test_rmse time (s) 0 0.796038 0.541990 -1.453147 -2.196943 1.392266 1 0.794954 0.540424 -1.457709 -2.196179 1.436932 2 0.790722 0.505922 -1.492522 -2.153457 1.444314 3 0.785317 0.580703 -1.474827 -2.189902 1.432303 4 0.795872 0.547917 -1.461929 -2.135072 1.747591 mean 0.792581 0.543391 -1.468027 -2.174311 1.490681 std 0.004114 0.023780 0.014222 0.025330 0.129719 In\u00a0[8]: Copied! 
<pre># The columns in the results dataframe contain one for each metric\natom.results[[\"r2_ht\", \"r2_train\", \"r2_test\", \"rmse_ht\", \"rmse_train\", \"rmse_test\"]]\n</pre> # The columns in the results dataframe contain one for each metric atom.results[[\"r2_ht\", \"r2_train\", \"r2_test\", \"rmse_ht\", \"rmse_train\", \"rmse_test\"]] Out[8]: r2_ht r2_train r2_test rmse_ht rmse_train rmse_test lSVM 0.492530 0.4583 0.4552 -2.262754 -2.3815 -2.3439 hGBM 0.546368 0.7183 0.4971 -2.119672 -1.7173 -2.2518 In\u00a0[9]: Copied! <pre># Some plots allow us to choose the metric we want to show\nwith atom.canvas():\n    atom.plot_trials(metric=\"r2\", title=\"Hyperparameter tuning performance for R2\")\n    atom.plot_trials(metric=\"rmse\", title=\"Hyperparameter tuning performance for RMSE\")\n</pre> # Some plots allow us to choose the metric we want to show with atom.canvas():     atom.plot_trials(metric=\"r2\", title=\"Hyperparameter tuning performance for R2\")     atom.plot_trials(metric=\"rmse\", title=\"Hyperparameter tuning performance for RMSE\") In\u00a0[10]: Copied! <pre>atom.plot_results(metric=\"r2\")\n</pre> atom.plot_results(metric=\"r2\")"}, {"location": "examples/multi_metric/#example-multi-metric-runs", "title": "Example: Multi-metric runs\u00b6", "text": "<p>This example shows how to evaluate an atom's pipeline on multiple metrics.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/multi_metric/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multi_metric/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multi_metric/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multiclass_classification/", "title": "Multiclass classification", "text": "In\u00a0[1]: Copied! 
<pre># Import packages\nfrom sklearn.datasets import load_wine\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_wine from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX, y = load_wine(return_X_y=True, as_frame=True)\n\n# Let's have a look\nX.head()\n</pre> # Load data X, y = load_wine(return_X_y=True, as_frame=True)  # Let's have a look X.head() Out[2]: alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline 0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 In\u00a0[3]: Copied! <pre>atom = ATOMClassifier(X, y, n_jobs=-1, verbose=2, random_state=1)\n\n# Fit the pipeline with the selected models\natom.run(\n    models=[\"LR\",\"LDA\", \"RF\"],\n    metric=\"roc_auc_ovr\",\n    n_trials=14,\n    n_bootstrap=5,\n    errors=\"raise\",\n)\n</pre> atom = ATOMClassifier(X, y, n_jobs=-1, verbose=2, random_state=1)  # Fit the pipeline with the selected models atom.run(     models=[\"LR\",\"LDA\", \"RF\"],     metric=\"roc_auc_ovr\",     n_trials=14,     n_bootstrap=5,     errors=\"raise\", ) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multiclass classification.\nParallel processing with 16 cores.\nParallelization backend: loky\n\nDataset stats ==================== &gt;&gt;\nShape: (178, 14)\nTrain set size: 143\nTest set size: 35\n-------------------------------------\nMemory: 19.36 kB\nScaled: False\nOutlier values: 12 (0.6%)\n\n\nTraining ========================= &gt;&gt;\nModels: LR, LDA, RF\nMetric: 
roc_auc_ovr\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |      l1 |  0.0054 |    saga |      480 |      0.7 |         0.5 |              0.5 |    10.567s | 10.567s | COMPLETE |\n| 1     |      l1 |   0.122 |    saga |      380 |      0.7 |      0.9951 |           0.9951 |    11.247s | 21.814s | COMPLETE |\n| 2     |      l2 |  0.0071 |     sag |      720 |      0.3 |         1.0 |              1.0 |    12.060s | 33.874s | COMPLETE |\n| 3     |      l1 | 87.9641 | libli.. |      920 |      0.3 |         1.0 |              1.0 |    10.158s | 44.032s | COMPLETE |\n| 4     |      l2 |  0.0114 |     sag |      630 |      0.7 |         1.0 |              1.0 |     7.990s | 52.022s | COMPLETE |\n| 5     |      l2 |  0.0018 |     sag |      920 |      0.1 |         1.0 |              1.0 |    11.685s | 01m:04s | COMPLETE |\n| 6     |      l2 | 43.4053 |     sag |      780 |      0.3 |         1.0 |              1.0 |     8.361s | 01m:12s | COMPLETE |\n| 7     |      l2 |  2.0759 | libli.. |      470 |      0.2 |         1.0 |              1.0 |     8.213s | 01m:20s | COMPLETE |\n| 8     |    None |   0.043 |     sag |      110 |      1.0 |         1.0 |              1.0 |     7.450s | 01m:28s | COMPLETE |\n| 9     |      l1 | 46.0233 |    saga |      740 |      0.1 |         1.0 |              1.0 |     7.951s | 01m:36s | COMPLETE |\n| 10    |      l2 |  0.4557 |   lbfgs |      280 |      0.5 |         1.0 |              1.0 |     8.807s | 01m:44s | COMPLETE |\n| 11    |      l2 |  0.0013 | libli.. |      940 |      0.4 |         1.0 |              1.0 |     7.970s | 01m:52s | COMPLETE |\n| 12    |      l2 |  4.8717 | newto.. 
|      780 |      0.3 |         1.0 |              1.0 |     8.202s | 02m:01s | COMPLETE |\n| 13    |      l2 |  0.0324 | libli.. |     1000 |      0.0 |         1.0 |              1.0 |     7.676s | 02m:08s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 2\nBest parameters:\n --&gt; penalty: l2\n --&gt; C: 0.0071\n --&gt; solver: sag\n --&gt; max_iter: 720\n --&gt; l1_ratio: 0.3\nBest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 02m:08s\nFit ---------------------------------------------\nTrain evaluation --&gt; roc_auc_ovr: 0.9991\nTest evaluation --&gt; roc_auc_ovr: 0.9977\nTime elapsed: 0.542s\nBootstrap ---------------------------------------\nEvaluation --&gt; roc_auc_ovr: 0.9984 \u00b1 0.001\nTime elapsed: 0.603s\n-------------------------------------------------\nTime: 02m:09s\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |    lsqr |       0.9 |      0.9221 |           0.9221 |     0.048s |  0.048s | COMPLETE |\n| 1     |   eigen |       1.0 |      0.9121 |           0.9221 |     0.027s |  0.074s | COMPLETE |\n| 2     |   eigen |       1.0 |      0.9121 |           0.9221 |     0.001s |  0.075s | COMPLETE |\n| 3     |    lsqr |       0.7 |      0.8638 |           0.9221 |     0.025s |  0.100s | COMPLETE |\n| 4     |   eigen |       0.7 |      0.9019 |           0.9221 |     0.024s |  0.124s | COMPLETE |\n| 5     |    lsqr |      auto |         1.0 |              1.0 |     0.025s |  0.149s | COMPLETE |\n| 6     |   eigen |       1.0 |      0.9121 |              1.0 |     0.000s |  0.149s | COMPLETE |\n| 7     |    lsqr |       1.0 |      0.9445 |              1.0 |     0.026s |  0.175s | COMPLETE |\n| 8     |     svd |      None |         1.0 |              1.0 |     0.025s |  0.200s | 
COMPLETE |\n| 9     |     svd |      None |         1.0 |              1.0 |     0.001s |  0.201s | COMPLETE |\n| 10    |    lsqr |      auto |         1.0 |              1.0 |     0.002s |  0.203s | COMPLETE |\n| 11    |     svd |      None |         1.0 |              1.0 |     0.002s |  0.205s | COMPLETE |\n| 12    |     svd |      None |         1.0 |              1.0 |     0.001s |  0.206s | COMPLETE |\n| 13    |     svd |      None |         1.0 |              1.0 |     0.001s |  0.207s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 5\nBest parameters:\n --&gt; solver: lsqr\n --&gt; shrinkage: auto\nBest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 0.207s\nFit ---------------------------------------------\nTrain evaluation --&gt; roc_auc_ovr: 1.0\nTest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 0.025s\nBootstrap ---------------------------------------\nEvaluation --&gt; roc_auc_ovr: 0.9998 \u00b1 0.0005\nTime elapsed: 0.038s\n-------------------------------------------------\nTime: 0.271s\n\n\nRunning hyperparameter tuning for RandomForest...\n| trial | n_estimators | criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------------ | --------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |          210 |      gini |        10 |                17 |               20 |          0.5 |     False |        None |       0.0 |      0.9803 |           0.9803 |     0.249s |  0.249s | COMPLETE |\n| 1     |          380 |      gini |         4 |                15 |                3 |          0.9 |     False |        None |      0.01 |      0.9816 |           0.9816 |     0.456s |  0.705s | COMPLETE |\n| 2     |          380 |   entropy |         6 |  
               2 |               13 |          0.9 |     False |        None |      0.03 |      0.9944 |           0.9944 |     0.502s |  1.206s | COMPLETE |\n| 3     |          470 |      gini |        11 |                 9 |               18 |          nan |      True |         0.6 |     0.025 |      0.9569 |           0.9944 |     9.106s | 10.312s | COMPLETE |\n| 4     |          100 |   entropy |        12 |                14 |                6 |          0.9 |     False |         nan |     0.035 |         1.0 |              1.0 |     8.530s | 18.842s | COMPLETE |\n| 5     |          470 |   entropy |        13 |                11 |                1 |          nan |      True |         0.6 |      0.01 |         1.0 |              1.0 |     1.391s | 20.233s | COMPLETE |\n| 6     |          250 |      gini |        14 |                13 |               17 |          0.7 |      True |         nan |      0.02 |         1.0 |              1.0 |     0.754s | 20.987s | COMPLETE |\n| 7     |          220 |      gini |         5 |                10 |                7 |          0.5 |      True |         0.9 |     0.035 |      0.9981 |              1.0 |     0.712s | 21.699s | COMPLETE |\n| 8     |          130 |   entropy |         4 |                 6 |               11 |          0.9 |     False |         nan |      0.03 |         1.0 |              1.0 |     0.532s | 22.231s | COMPLETE |\n| 9     |          370 |      gini |        12 |                 2 |                4 |          0.5 |     False |         nan |      0.02 |      0.9916 |              1.0 |     0.823s | 23.055s | COMPLETE |\n| 10    |           10 |   entropy |        12 |                20 |                7 |         log2 |     False |         nan |     0.035 |         1.0 |              1.0 |     0.522s | 23.577s | COMPLETE |\n| 11    |           70 |   entropy |        13 |                12 |                1 |         None |      True |         0.5 |      0.01 |      0.9928 |              
1.0 |     0.614s | 24.191s | COMPLETE |\n| 12    |          500 |   entropy |         9 |                 7 |                7 |          0.6 |      True |         0.6 |      0.01 |         1.0 |              1.0 |     1.139s | 25.330s | COMPLETE |\n| 13    |          140 |   entropy |        16 |                16 |                1 |          0.8 |      True |         0.7 |       0.0 |         1.0 |              1.0 |     0.750s | 26.080s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 4\nBest parameters:\n --&gt; n_estimators: 100\n --&gt; criterion: entropy\n --&gt; max_depth: 12\n --&gt; min_samples_split: 14\n --&gt; min_samples_leaf: 6\n --&gt; max_features: 0.9\n --&gt; bootstrap: False\n --&gt; max_samples: None\n --&gt; ccp_alpha: 0.035\nBest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 26.080s\nFit ---------------------------------------------\nTrain evaluation --&gt; roc_auc_ovr: 0.9993\nTest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 0.737s\nBootstrap ---------------------------------------\nEvaluation --&gt; roc_auc_ovr: 0.9936 \u00b1 0.0067\nTime elapsed: 0.721s\n-------------------------------------------------\nTime: 27.539s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 02m:40s\n-------------------------------------\nLogisticRegression         --&gt; roc_auc_ovr: 0.9984 \u00b1 0.001\nLinearDiscriminantAnalysis --&gt; roc_auc_ovr: 0.9998 \u00b1 0.0005 !\nRandomForest               --&gt; roc_auc_ovr: 0.9936 \u00b1 0.0067\n</pre> In\u00a0[4]: Copied! <pre>atom.results\n</pre> atom.results Out[4]: roc_auc_ovr_ht time_ht roc_auc_ovr_train roc_auc_ovr_test time_fit roc_auc_ovr_bootstrap time_bootstrap time LR 1.0 128.337325 0.9979 0.9977 0.542487 0.998413 0.602810 129.482622 LDA 1.0 0.207456 1.0000 0.9989 0.025409 0.999773 0.038035 0.270900 RF 1.0 26.080413 0.9951 0.9919 0.737324 0.993613 0.721398 27.539135 In\u00a0[5]: Copied! 
<pre># Show the score for some different metrics\natom.evaluate([\"precision_macro\", \"recall_macro\", \"jaccard_weighted\"])\n</pre> # Show the score for some different metrics atom.evaluate([\"precision_macro\", \"recall_macro\", \"jaccard_weighted\"]) Out[5]: precision_macro recall_macro jaccard_weighted LR 0.9429 0.9484 0.8924 LDA 0.9667 0.9762 0.9457 RF 0.8799 0.8915 0.7968 In\u00a0[10]: Copied! <pre># Some plots allow you to choose the target class to look at\natom.rf.plot_probabilities(rows=\"train\", target=0)\n</pre> # Some plots allow you to choose the target class to look at atom.rf.plot_probabilities(rows=\"train\", target=0) In\u00a0[8]: Copied! <pre>atom.lda.plot_shap_heatmap(target=2, show=7)\n</pre> atom.lda.plot_shap_heatmap(target=2, show=7)"}, {"location": "examples/multiclass_classification/#example-multiclass-classification", "title": "Example: Multiclass classification\u00b6", "text": "<p>This example shows how to compare the performance of three models on a multiclass classification task.</p> <p>Import the wine dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict wines into three groups (which cultivator it's from) using features based on the results of chemical analysis.</p>"}, {"location": "examples/multiclass_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multiclass_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multiclass_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multilabel_classification/", "title": "Multilabel classification", "text": "In\u00a0[1]: Copied! 
<pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_multilabel_classification\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier from sklearn.datasets import make_multilabel_classification In\u00a0[2]: Copied! <pre># Create data\nX, y = make_multilabel_classification(n_samples=300, n_classes=3, random_state=1)\n</pre> # Create data X, y = make_multilabel_classification(n_samples=300, n_classes=3, random_state=1) In\u00a0[3]: Copied! <pre># Note that for multioutput tasks, you must specify the `y` keyword\natom = ATOMClassifier(X, y=y, verbose=2, random_state=1)\n</pre> # Note that for multioutput tasks, you must specify the `y` keyword atom = ATOMClassifier(X, y=y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multilabel classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (300, 23)\nTrain set size: 240\nTest set size: 60\n-------------------------------------\nMemory: 51.73 kB\nScaled: False\nOutlier values: 29 (0.5%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># Show the models that natively support multilabel tasks\natom.available_models()[[\"acronym\", \"model\", \"native_multilabel\"]]\n</pre> # Show the models that natively support multilabel tasks atom.available_models()[[\"acronym\", \"model\", \"native_multilabel\"]] Out[4]: acronym model native_multilabel 0 AdaB AdaBoost False 1 Bag Bagging False 2 BNB BernoulliNB False 3 CatB CatBoost False 4 CatNB CategoricalNB False 5 CNB ComplementNB False 6 Tree DecisionTree True 7 Dummy Dummy False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM False 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM False 18 LR LogisticRegression False 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB False 21 PA PassiveAggressive False 22 Perc Perceptron False 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest True 26 Ridge Ridge True 27 SGD StochasticGradientDescent False 28 SVM SupportVectorMachine False 29 XGB XGBoost False In\u00a0[5]: Copied! 
<pre>atom.run(models=[\"LDA\", \"RF\"], metric=\"recall_weighted\")\n</pre> atom.run(models=[\"LDA\", \"RF\"], metric=\"recall_weighted\") <pre>\nTraining ========================= &gt;&gt;\nModels: LDA, RF\nMetric: recall_weighted\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; recall_weighted: 0.9124\nTest evaluation --&gt; recall_weighted: 0.8351\nTime elapsed: 0.037s\n-------------------------------------------------\nTime: 0.037s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; recall_weighted: 1.0\nTest evaluation --&gt; recall_weighted: 0.8763\nTime elapsed: 0.170s\n-------------------------------------------------\nTime: 0.170s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.269s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; recall_weighted: 0.8351\nRandomForest               --&gt; recall_weighted: 0.8763 !\n</pre> In\u00a0[6]: Copied! <pre># Note that non-native multioutput models use a meta-estimator wrapper\nprint(f\"Estimator for LDA is: {atom.lda.estimator}\")\nprint(f\"Estimator for RF is: {atom.rf.estimator}\")\n</pre> # Note that non-native multioutput models use a meta-estimator wrapper print(f\"Estimator for LDA is: {atom.lda.estimator}\") print(f\"Estimator for RF is: {atom.rf.estimator}\") <pre>Estimator for LDA is: ClassifierChain(base_estimator=LinearDiscriminantAnalysis(), random_state=1)\nEstimator for RF is: RandomForestClassifier(n_jobs=1, random_state=1)\n</pre> In\u00a0[7]: Copied! 
<pre>from atom import ATOMModel\nfrom sklearn.multioutput import ClassifierChain\nfrom sklearn.linear_model import LogisticRegression\nfrom optuna.distributions import CategoricalDistribution, IntDistribution\n\ncustom_model = ATOMModel(\n    estimator=ClassifierChain(LogisticRegression(), cv=3),\n    name=\"chain\",\n    needs_scaling=True,\n    native_multilabel=True,\n)\n\natom.run(\n    models=custom_model,\n    n_trials=5,\n    ht_params={\n        \"distributions\": {\n            \"order\": CategoricalDistribution([[0, 1, 2], [2, 1, 0], [1, 2, 0]]),\n            \"base_estimator__max_iter\": IntDistribution(100, 200, step=10),\n            \"base_estimator__solver\": CategoricalDistribution([\"lbfgs\", \"newton-cg\"]),            \n        }\n    },\n)\n</pre> from atom import ATOMModel from sklearn.multioutput import ClassifierChain from sklearn.linear_model import LogisticRegression from optuna.distributions import CategoricalDistribution, IntDistribution  custom_model = ATOMModel(     estimator=ClassifierChain(LogisticRegression(), cv=3),     name=\"chain\",     needs_scaling=True,     native_multilabel=True, )  atom.run(     models=custom_model,     n_trials=5,     ht_params={         \"distributions\": {             \"order\": CategoricalDistribution([[0, 1, 2], [2, 1, 0], [1, 2, 0]]),             \"base_estimator__max_iter\": IntDistribution(100, 200, step=10),             \"base_estimator__solver\": CategoricalDistribution([\"lbfgs\", \"newton-cg\"]),                     }     }, ) <pre>\nTraining ========================= &gt;&gt;\nModels: chain\nMetric: recall_weighted\n\n\nRunning hyperparameter tuning for ClassifierChain...\n| trial |     order | base_estimator__max_iter | base_estimator__solver | recall_weighted | best_recall_weighted | time_trial | time_ht |    state |\n| ----- | --------- | ------------------------ | ---------------------- | --------------- | -------------------- | ---------- | ------- | -------- |\n| 0     | [2, 1, 0] |        
              130 |                  lbfgs |          0.8831 |               0.8831 |     2.813s |  2.813s | COMPLETE |\n| 1     | [1, 2, 0] |                      150 |              newton-cg |          0.9091 |               0.9091 |     2.184s |  4.997s | COMPLETE |\n| 2     | [2, 1, 0] |                      170 |              newton-cg |          0.8701 |               0.9091 |     0.085s |  5.082s | COMPLETE |\n| 3     | [1, 2, 0] |                      200 |              newton-cg |          0.9221 |               0.9221 |     0.084s |  5.166s | COMPLETE |\n| 4     | [2, 1, 0] |                      100 |              newton-cg |          0.8701 |               0.9221 |     0.078s |  5.244s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 3\nBest parameters:\n --&gt; order: [1, 2, 0]\n --&gt; base_estimator__max_iter: 200\n --&gt; base_estimator__solver: newton-cg\nBest evaluation --&gt; recall_weighted: 0.9221\nTime elapsed: 5.244s\nFit ---------------------------------------------\nTrain evaluation --&gt; recall_weighted: 0.9021\nTest evaluation --&gt; recall_weighted: 0.866\nTime elapsed: 0.101s\n-------------------------------------------------\nTime: 5.345s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.397s\n-------------------------------------\nClassifierChain --&gt; recall_weighted: 0.866\n</pre> In\u00a0[8]: Copied! <pre>thresholds = atom.rf.get_best_threshold()\nprint(f\"Best threshold per target column: {thresholds}\")\n</pre> thresholds = atom.rf.get_best_threshold() print(f\"Best threshold per target column: {thresholds}\") <pre>Best threshold per target column: [0.7, 0.69, 0.63]\n</pre> In\u00a0[9]: Copied! 
<pre>atom.rf.evaluate(threshold=thresholds)\n</pre> atom.rf.evaluate(threshold=thresholds) Out[9]: <pre>accuracy              0.5667\nap                    0.8893\nf1_weighted           0.7274\njaccard_weighted      0.6271\nprecision_weighted    0.8269\nrecall_weighted       0.6495\nauc                   0.9213\nName: RF, dtype: float64</pre> In\u00a0[10]: Copied! <pre># Use the target parameter in plots to specify which target column to use\natom.plot_roc(target=2)\n</pre> # Use the target parameter in plots to specify which target column to use atom.plot_roc(target=2) In\u00a0[11]: Copied! <pre># When the target parameter also specifies the class, use format (column, class)\natom.plot_probabilities(models=\"chain\", target=(2, 1))\n</pre> # When the target parameter also specifies the class, use format (column, class) atom.plot_probabilities(models=\"chain\", target=(2, 1)) In\u00a0[12]: Copied! <pre>with atom.canvas(figsize=(900, 600)):\n    atom.plot_calibration(target=0)\n    atom.plot_calibration(target=1)\n</pre> with atom.canvas(figsize=(900, 600)):     atom.plot_calibration(target=0)     atom.plot_calibration(target=1)"}, {"location": "examples/multilabel_classification/#example-multilabel-classification", "title": "Example: Multilabel classification\u00b6", "text": "<p>This example shows how to use ATOM to solve a multilabel classification problem.</p> <p>The data used is a synthetic dataset created using sklearn's make_multilabel_classification function.</p>"}, {"location": "examples/multilabel_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multilabel_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multilabel_classification/#add-custom-multilabel-models", "title": "Add custom multilabel models\u00b6", "text": "<p>To use your own meta-estimator with custom parameters, add it as a custom model. 
It's also possible to tune the hyperparameters of this custom meta-estimator.</p>"}, {"location": "examples/multilabel_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multioutput_regression/", "title": "Multioutput regression", "text": "In\u00a0[1]: Copied! <pre># Disable annoying tf warnings\nimport os\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\nfrom tensorflow import get_logger\nget_logger().setLevel('ERROR')\n\nimport numpy as np\nfrom atom import ATOMRegressor, ATOMModel\nfrom sklearn.datasets import make_regression\n\nfrom scikeras.wrappers import KerasRegressor\nfrom keras.models import Sequential\nfrom keras.layers import Dense\n</pre> # Disable annoying tf warnings import os os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"  from tensorflow import get_logger get_logger().setLevel('ERROR')  import numpy as np from atom import ATOMRegressor, ATOMModel from sklearn.datasets import make_regression  from scikeras.wrappers import KerasRegressor from keras.models import Sequential from keras.layers import Dense In\u00a0[2]: Copied! <pre># Create data\nX, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3)\n</pre> # Create data X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3) In\u00a0[3]: Copied! 
<pre># Create the neural network\nclass NeuralNetwork(KerasRegressor):\n    \"\"\"Multioutput multilayer perceptron.\"\"\"\n\n    @staticmethod\n    def _keras_build_fn(n_inputs, n_outputs, **kwargs):\n        \"\"\"Create the model's architecture.\"\"\"\n        model = Sequential()\n        model.add(Dense(20, input_dim=n_inputs, activation=\"relu\"))\n        model.add(Dense(20, activation=\"relu\"))\n        model.add(Dense(n_outputs))\n        model.compile(loss=\"mse\", optimizer=\"adam\")\n        return model\n</pre> # Create the neural network class NeuralNetwork(KerasRegressor):     \"\"\"Multioutput multilayer perceptron.\"\"\"      @staticmethod     def _keras_build_fn(n_inputs, n_outputs, **kwargs):         \"\"\"Create the model's architecture.\"\"\"         model = Sequential()         model.add(Dense(20, input_dim=n_inputs, activation=\"relu\"))         model.add(Dense(20, activation=\"relu\"))         model.add(Dense(n_outputs))         model.compile(loss=\"mse\", optimizer=\"adam\")         return model In\u00a0[4]: Copied! <pre># Convert the model to an ATOM model\nmodel = ATOMModel(\n    estimator=NeuralNetwork(n_inputs=5, n_outputs=y.shape[1], epochs=100, verbose=0),\n    name=\"NN\",\n    needs_scaling=True,  # Applies automated feature scaling before fitting\n    native_multioutput=True,  # Do not use a multioutput meta-estimator wrapper\n)\n</pre> # Convert the model to an ATOM model model = ATOMModel(     estimator=NeuralNetwork(n_inputs=5, n_outputs=y.shape[1], epochs=100, verbose=0),     name=\"NN\",     needs_scaling=True,  # Applies automated feature scaling before fitting     native_multioutput=True,  # Do not use a multioutput meta-estimator wrapper ) In\u00a0[5]: Copied! 
<pre>atom = ATOMRegressor(X, y=y, verbose=2, random_state=1)\n</pre> atom = ATOMRegressor(X, y=y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multioutput regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (1000, 13)\nTrain set size: 800\nTest set size: 200\n-------------------------------------\nMemory: 104.13 kB\nScaled: True\nOutlier values: 27 (0.3%)\n\n</pre> In\u00a0[6]: Copied! <pre># Show the models that natively support multioutput tasks\natom.available_models()[[\"acronym\", \"model\", \"native_multioutput\"]]\n</pre> # Show the models that natively support multioutput tasks atom.available_models()[[\"acronym\", \"model\", \"native_multioutput\"]] Out[6]: acronym model native_multioutput 0 AdaB AdaBoost False 1 ARD AutomaticRelevanceDetermination False 2 Bag Bagging False 3 BR BayesianRidge False 4 CatB CatBoost False 5 Tree DecisionTree True 6 Dummy Dummy False 7 EN ElasticNet False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GP GaussianProcess False 11 GBM GradientBoostingMachine False 12 Huber HuberRegression False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 Lasso Lasso False 16 Lars LeastAngleRegression False 17 LGB LightGBM False 18 lSVM LinearSVM False 19 MLP MultiLayerPerceptron False 20 OLS OrdinaryLeastSquares False 21 OMP OrthogonalMatchingPursuit False 22 PA PassiveAggressive False 23 RNN RadiusNearestNeighbors True 24 RF RandomForest True 25 Ridge Ridge False 26 SGD StochasticGradientDescent False 27 SVM SupportVectorMachine False 28 XGB XGBoost False In\u00a0[7]: Copied! 
<pre># Note we only added 5 informative features to the dataset, let's remove the rest\n# If we use a model with no native support for multioutput as solver, specify the\n# rfe's importance_getter parameter and return the mean of the coefficients over the\n# target columns\natom.feature_selection(\n    strategy=\"rfe\",\n    solver=\"ols\",  # This becomes MultiOutputRegressor(OLS)\n    n_features=5,\n    importance_getter=lambda x: np.mean([e.coef_ for e in x.estimators_], axis=0),\n)\n</pre> # Note we only added 5 informative features to the dataset, let's remove the rest # If we use a model with no native support for multioutput as solver, specify the # rfe's importance_getter parameter and return the mean of the coefficients over the # target columns atom.feature_selection(     strategy=\"rfe\",     solver=\"ols\",  # This becomes MultiOutputRegressor(OLS)     n_features=5,     importance_getter=lambda x: np.mean([e.coef_ for e in x.estimators_], axis=0), ) <pre>Fitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfe selected 5 features from the dataset.\n   --&gt; Dropping feature x0 (rank 6).\n   --&gt; Dropping feature x5 (rank 5).\n   --&gt; Dropping feature x6 (rank 3).\n   --&gt; Dropping feature x7 (rank 2).\n   --&gt; Dropping feature x9 (rank 4).\n</pre> In\u00a0[8]: Copied! 
<pre># Let's train a native, non-native and our custom model\natom.run(models=[\"Lasso\", \"RF\", model], metric=\"mse\")\n</pre> # Let's train a native, non-native and our custom model atom.run(models=[\"Lasso\", \"RF\", model], metric=\"mse\") <pre>\nTraining ========================= &gt;&gt;\nModels: Lasso, RF, NN\nMetric: mse\n\n\nResults for Lasso:\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -5.1516\nTest evaluation --&gt; mse: -5.5774\nTime elapsed: 0.031s\n-------------------------------------------------\nTime: 0.031s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -200.7336\nTest evaluation --&gt; mse: -1494.3406\nTime elapsed: 0.706s\n-------------------------------------------------\nTime: 0.706s\n\n\nResults for NeuralNetwork:\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -111.3789\nTest evaluation --&gt; mse: -105.2649\nTime elapsed: 2.372s\n-------------------------------------------------\nTime: 2.372s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.116s\n-------------------------------------\nLasso         --&gt; mse: -5.5774 !\nRandomForest  --&gt; mse: -1494.3406 ~\nNeuralNetwork --&gt; mse: -105.2649\n</pre> In\u00a0[9]: Copied! 
<pre># And check which of the models used a meta-estimator wrapper\nfor m in atom.models:\n    print(f\"Estimator for {m} is: {atom[m].estimator}\")\n</pre> # And check which of the models used a meta-estimator wrapper for m in atom.models:     print(f\"Estimator for {m} is: {atom[m].estimator}\") <pre>Estimator for Lasso is: MultiOutputRegressor(estimator=Lasso(random_state=1), n_jobs=1)\nEstimator for RF is: RandomForestRegressor(n_jobs=1, random_state=1)\nEstimator for NN is: NeuralNetwork(\n\tmodel=None\n\tbuild_fn=None\n\twarm_start=False\n\trandom_state=1\n\toptimizer=rmsprop\n\tloss=None\n\tmetrics=None\n\tbatch_size=None\n\tvalidation_batch_size=None\n\tverbose=0\n\tcallbacks=None\n\tvalidation_split=0.0\n\tshuffle=True\n\trun_eagerly=False\n\tepochs=100\n\tn_inputs=5\n\tn_outputs=3\n\tname=NN\n\tneeds_scaling=True\n\tnative_multioutput=True\n\tnative_multilabel=False\n\thas_validation=None\n)\n</pre> In\u00a0[10]: Copied! <pre># Use the target parameter in plots to specify which target column to use\natom.plot_residuals(target=2)\n</pre> # Use the target parameter in plots to specify which target column to use atom.plot_residuals(target=2) In\u00a0[11]: Copied! 
<pre>with atom.canvas(3, 1, figsize=(900, 1300)):\n    atom.plot_errors(target=0)\n    atom.plot_errors(target=1)\n    atom.plot_errors(target=2)\n</pre> with atom.canvas(3, 1, figsize=(900, 1300)):     atom.plot_errors(target=0)     atom.plot_errors(target=1)     atom.plot_errors(target=2) <pre>\n---------------------------------------------------------------------------\nValueError                                Traceback (most recent call last)\nCell In[11], line 2\n      1 with atom.canvas(3, 1, figsize=(900, 1300)):\n----&gt; 2     atom.plot_errors(target=0)\n      3     atom.plot_errors(target=1)\n      4     atom.plot_errors(target=2)\n\nFile ~\\Documents\\Python\\ATOM\\atom\\utils\\utils.py:2712, in crash.&lt;locals&gt;.wrapper(*args, **kwargs)\n   2709     cache[\"last_exception\"] = ex\n   2710     args[0].logger.exception(\"Exception encountered:\")\n-&gt; 2712 raise ex\n\nFile ~\\Documents\\Python\\ATOM\\atom\\utils\\utils.py:2704, in crash.&lt;locals&gt;.wrapper(*args, **kwargs)\n   2701 @wraps(f)\n   2702 def wrapper(*args, **kwargs) -&gt; Any:\n   2703     try:  # Run the function\n-&gt; 2704         return f(*args, **kwargs)\n   2706     except Exception as ex:\n   2707         # If exception is not the same as last, write to log\n   2708         if ex is not cache[\"last_exception\"] and args[0].logger:\n\nFile ~\\Documents\\Python\\ATOM\\atom\\plots\\predictionplot.py:691, in PredictionPlot.plot_errors(self, models, rows, target, title, legend, figsize, filename, display)\n    689         from atom.models import OrdinaryLeastSquares\n    690         model = OrdinaryLeastSquares(goal=self.task.goal, branches=self._branches)\n--&gt; 691         estimator = model._get_est().fit(bk.DataFrame(y_true), y_pred)\n    693         fig.add_trace(\n    694             self._draw_line(\n    695                 x=(x := np.linspace(y_true.min(), y_true.max(), 100)),\n   (...)\n    703             )\n    704         )\n    706 
self._draw_straight_line(y=\"diagonal\", xaxis=xaxis, yaxis=yaxis)\n\nFile ~\\Documents\\Python\\ATOM\\venv310\\lib\\site-packages\\sklearn\\base.py:1152, in _fit_context.&lt;locals&gt;.decorator.&lt;locals&gt;.wrapper(estimator, *args, **kwargs)\n   1145     estimator._validate_params()\n   1147 with config_context(\n   1148     skip_parameter_validation=(\n   1149         prefer_skip_nested_validation or global_skip_validation\n   1150     )\n   1151 ):\n-&gt; 1152     return fit_method(estimator, *args, **kwargs)\n\nFile ~\\Documents\\Python\\ATOM\\venv310\\lib\\site-packages\\sklearn\\multioutput.py:248, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params)\n    245     check_classification_targets(y)\n    247 if y.ndim == 1:\n--&gt; 248     raise ValueError(\n    249         \"y must have at least two dimensions for \"\n    250         \"multi-output regression but has only one.\"\n    251     )\n    253 if _routing_enabled():\n    254     routed_params = process_routing(\n    255         obj=self,\n    256         method=\"fit\",\n    257         other_params=fit_params,\n    258         sample_weight=sample_weight,\n    259     )\n\nValueError: y must have at least two dimensions for multi-output regression but has only one.</pre>"}, {"location": "examples/multioutput_regression/#example-multioutput-regression", "title": "Example: Multioutput regression\u00b6", "text": "<p>This example shows how to use ATOM to make preditions on a multioutput regression dataset. 
One of the models used is a MLP regressor implemented with Keras using scikeras.</p> <p>The data used is a synthetic dataset created using sklearn's make_regression function.</p>"}, {"location": "examples/multioutput_regression/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multioutput_regression/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multioutput_regression/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/nlp/", "title": "NLP", "text": "In\u00a0[1]: Copied! <pre>import numpy as np\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import fetch_20newsgroups\n</pre> import numpy as np from atom import ATOMClassifier from sklearn.datasets import fetch_20newsgroups In\u00a0[2]: Copied! <pre># Use only a subset of the available topics for faster processing\nX_text, y_text = fetch_20newsgroups(\n    return_X_y=True,\n    categories=[\n        'sci.med',\n        'comp.windows.x',\n        'misc.forsale',\n        'rec.autos',\n    ],\n    shuffle=True,\n    random_state=1,\n)\nX_text = np.array(X_text).reshape(-1, 1)\n</pre> # Use only a subset of the available topics for faster processing X_text, y_text = fetch_20newsgroups(     return_X_y=True,     categories=[         'sci.med',         'comp.windows.x',         'misc.forsale',         'rec.autos',     ],     shuffle=True,     random_state=1, ) X_text = np.array(X_text).reshape(-1, 1) In\u00a0[3]: Copied! 
<pre>atom = ATOMClassifier(X_text, y_text, index=True, test_size=0.3, verbose=2, random_state=1)\n</pre> atom = ATOMClassifier(X_text, y_text, index=True, test_size=0.3, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multiclass classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (2366, 2)\nTrain set size: 1657\nTest set size: 709\n-------------------------------------\nMemory: 122.87 kB\nScaled: False\nCategorical features: 1 (100.0%)\n\n</pre> In\u00a0[4]: Copied! <pre>atom.dataset  # Note that the feature is automatically named 'corpus'\n</pre> atom.dataset  # Note that the feature is automatically named 'corpus' Out[4]: corpus target 1731 From: rlm@helen.surfcty.com (Robert L. McMilli... 0 1496 From: carl@SOL1.GPS.CALTECH.EDU (Carl J Lydick... 3 1290 From: thssjxy@iitmax.iit.edu (Smile)\\nSubject:... 1 2021 From: c23st@kocrsv01.delcoelect.com (Spiros Tr... 2 142 From: ginkgo@ecsvax.uncecs.edu (J. Geary Morto... 1 ... ... ... 510 From: mary@uicsl.csl.uiuc.edu (Mary E. Allison... 3 1948 From: ndd@sunbar.mc.duke.edu (Ned Danieley)\\nS... 0 798 From: kk@unisql.UUCP (Kerry Kimbrough)\\nSubjec... 0 2222 From: hamachi@adobe.com (Gordon Hamachi)\\nSubj... 2 2215 From: mobasser@vu-vlsi.ee.vill.edu (Bijan Moba... 2 <p>2366 rows \u00d7 2 columns</p> In\u00a0[5]: Copied! <pre># Let's have a look at the first document\natom.corpus[0]\n</pre> # Let's have a look at the first document atom.corpus[0] Out[5]: <pre>'From: caf@omen.UUCP (Chuck Forsberg WA7KGX)\\nSubject: Re: My New Diet --&gt; IT WORKS GREAT !!!!\\nOrganization: Omen Technology INC, Portland Rain Forest\\nLines: 32\\n\\nIn article &lt;1qk6v3INNrm6@lynx.unm.edu&gt; bhjelle@carina.unm.edu () writes:\\n&gt;\\n&gt;Gordon Banks:\\n&gt;\\n&gt;&gt;a lot to keep from going back to morbid obesity.  I think all\\n&gt;&gt;of us cycle.  
One\\'s success depends on how large the fluctuations\\n&gt;&gt;in the cycle are.  Some people can cycle only 5 pounds.  Unfortunately,\\n&gt;&gt;I\\'m not one of them.\\n&gt;&gt;\\n&gt;&gt;\\n&gt;This certainly describes my situation perfectly. For me there is\\n&gt;a constant dynamic between my tendency to eat, which appears to\\n&gt;be totally limitless, and the purely conscious desire to not\\n&gt;put on too much weight. When I get too fat, I just diet/exercise\\n&gt;more (with varying degrees of success) to take off the\\n&gt;extra weight. Usually I cycle within a 15 lb range, but\\n&gt;smaller and larger cycles occur as well. I\\'m always afraid\\n&gt;that this method will stop working someday, but usually\\n&gt;I seem to be able to hold the weight gain in check.\\n&gt;This is one reason I have a hard time accepting the notion\\n&gt;of some metabolic derangement associated with cycle dieting\\n&gt;(that results in long-term weight gain). I have been cycle-\\n&gt;dieting for at least 20 years without seeing such a change.\\n\\nAs mentioned in Adiposity 101, only some experience weight\\nrebound.  The fact that you don\\'t doesn\\'t prove it doesn\\'t\\nhappen to others.\\n-- \\nChuck Forsberg WA7KGX          ...!tektronix!reed!omen!caf \\nAuthor of YMODEM, ZMODEM, Professional-YAM, ZCOMM, and DSZ\\n  Omen Technology Inc    \"The High Reliability Software\"\\n17505-V NW Sauvie IS RD   Portland OR 97231   503-621-3406\\n'</pre> In\u00a0[6]: Copied! <pre># Clean the documents from noise (emails, numbers, etc...)\natom.textclean()\n</pre> # Clean the documents from noise (emails, numbers, etc...) 
atom.textclean() <pre>Fitting TextCleaner...\nCleaning the corpus...\n --&gt; Decoding unicode characters to ascii.\n --&gt; Converting text to lower case.\n --&gt; Dropping emails from documents.\n --&gt; Dropping URL links from documents.\n --&gt; Dropping HTML tags from documents.\n --&gt; Dropping emojis from documents.\n --&gt; Dropping numbers from documents.\n --&gt; Dropping punctuation from the text.\n</pre> In\u00a0[7]: Copied! <pre># Check how the first document changed\natom.corpus[0]\n</pre> # Check how the first document changed atom.corpus[0] Out[7]: <pre>'from  chuck forsberg wa7kgx\\nsubject re my new diet  it works great \\norganization omen technology inc portland rain forest\\nlines \\n\\nin article    writes\\n\\ngordon banks\\n\\na lot to keep from going back to morbid obesity  i think all\\nof us cycle  ones success depends on how large the fluctuations\\nin the cycle are  some people can cycle only  pounds  unfortunately\\nim not one of them\\n\\n\\nthis certainly describes my situation perfectly for me there is\\na constant dynamic between my tendency to eat which appears to\\nbe totally limitless and the purely conscious desire to not\\nput on too much weight when i get too fat i just dietexercise\\nmore with varying degrees of success to take off the\\nextra weight usually i cycle within a  lb range but\\nsmaller and larger cycles occur as well im always afraid\\nthat this method will stop working someday but usually\\ni seem to be able to hold the weight gain in check\\nthis is one reason i have a hard time accepting the notion\\nof some metabolic derangement associated with cycle dieting\\nthat results in longterm weight gain i have been cycle\\ndieting for at least  years without seeing such a change\\n\\nas mentioned in adiposity  only some experience weight\\nrebound  the fact that you dont doesnt prove it doesnt\\nhappen to others\\n \\nchuck forsberg wa7kgx          tektronixreedomencaf \\nauthor of ymodem zmodem professionalyam 
zcomm and dsz\\n  omen technology inc    the high reliability software\\nv nw sauvie is rd   portland or    \\n'</pre> In\u00a0[8]: Copied! <pre># Convert the strings to a sequence of words\natom.tokenize()\n</pre> # Convert the strings to a sequence of words atom.tokenize() <pre>Fitting Tokenizer...\nTokenizing the corpus...\n</pre> In\u00a0[9]: Copied! <pre># Print the first few words of the first document\natom.corpus[0][:7]\n</pre> # Print the first few words of the first document atom.corpus[0][:7] Out[9]: <pre>['from', 'chuck', 'forsberg', 'wa7kgx', 'subject', 're', 'my']</pre> In\u00a0[10]: Copied! <pre># Normalize the text to a predefined standard\natom.textnormalize(stopwords=\"english\", lemmatize=True)\n</pre> # Normalize the text to a predefined standard atom.textnormalize(stopwords=\"english\", lemmatize=True) <pre>Fitting TextNormalizer...\nNormalizing the corpus...\n --&gt; Dropping stopwords.\n --&gt; Applying lemmatization.\n</pre> In\u00a0[11]: Copied! <pre>atom.corpus[0][:7]  # Check changes...\n</pre> atom.corpus[0][:7]  # Check changes... Out[11]: <pre>['chuck', 'forsberg', 'wa7kgx', 'subject', 'new', 'diet', 'work']</pre> In\u00a0[12]: Copied! <pre># Visualize the most common words with a wordcloud\natom.plot_wordcloud(figsize=(700, 500))\n</pre> # Visualize the most common words with a wordcloud atom.plot_wordcloud(figsize=(700, 500)) In\u00a0[13]: Copied! <pre># Have a look at the most frequent bigrams\natom.plot_ngrams(2)\n</pre> # Have a look at the most frequent bigrams atom.plot_ngrams(2) In\u00a0[14]: Copied! <pre># Create the bigrams using the tokenizer\natom.tokenize(bigram_freq=215)\n</pre> # Create the bigrams using the tokenizer atom.tokenize(bigram_freq=215) <pre>Fitting Tokenizer...\nTokenizing the corpus...\n --&gt; Creating 7 bigrams on 3128 locations.\n</pre> In\u00a0[15]: Copied! 
<pre>atom.bigrams_\n</pre> atom.bigrams_ Out[15]: bigram frequency 0 x_x 1168 1 line_article 532 2 line_nntppostinghost 389 3 organization_university 331 4 gordon_bank 266 5 distribution_usa 227 6 line_distribution 215 In\u00a0[16]: Copied! <pre># As a last step before modelling, convert the words to vectors\natom.vectorize(strategy=\"tfidf\")\n</pre> # As a last step before modelling, convert the words to vectors atom.vectorize(strategy=\"tfidf\") <pre>Fitting Vectorizer...\nVectorizing the corpus...\n</pre> In\u00a0[17]: Copied! <pre># The dimensionality of the dataset has increased a lot!\natom.shape\n</pre> # The dimensionality of the dataset has increased a lot! atom.shape Out[17]: <pre>(2366, 24176)</pre> In\u00a0[18]: Copied! <pre># Note that the data is sparse and the columns are named\n# after the words they are embedding\natom.dtypes\n</pre> # Note that the data is sparse and the columns are named # after the words they are embedding atom.dtypes Out[18]: <pre>corpus_000000e5    Sparse[float64, 0]\ncorpus_00000ee5    Sparse[float64, 0]\ncorpus_000010af    Sparse[float64, 0]\ncorpus_0007259d    Sparse[float64, 0]\ncorpus_00072a27    Sparse[float64, 0]\n                          ...        \ncorpus_zurich      Sparse[float64, 0]\ncorpus_zvi         Sparse[float64, 0]\ncorpus_zx          Sparse[float64, 0]\ncorpus_zz          Sparse[float64, 0]\ntarget                          int64\nLength: 24176, dtype: object</pre> In\u00a0[19]: Copied! <pre># When the dataset is sparse, stats() shows the density\natom.stats()\n</pre> # When the dataset is sparse, stats() shows the density atom.stats() <pre>Dataset stats ==================== &gt;&gt;\nShape: (2366, 24176)\nTrain set size: 1657\nTest set size: 709\n-------------------------------------\nMemory: 2.54 MB\nSparse: True\nDensity: 0.35%\n</pre> In\u00a0[20]: Copied! 
<pre># Check which models have support for sparse matrices\natom.available_models()[[\"acronym\", \"model\", \"accepts_sparse\"]]\n</pre> # Check which models have support for sparse matrices atom.available_models()[[\"acronym\", \"model\", \"accepts_sparse\"]] Out[20]: acronym model accepts_sparse 0 AdaB AdaBoost True 1 Bag Bagging True 2 BNB BernoulliNB True 3 CatB CatBoost True 4 CatNB CategoricalNB True 5 CNB ComplementNB True 6 Tree DecisionTree True 7 Dummy Dummy False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine True 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM True 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM True 18 LR LogisticRegression True 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB True 21 PA PassiveAggressive True 22 Perc Perceptron False 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest True 26 Ridge Ridge True 27 SGD StochasticGradientDescent True 28 SVM SupportVectorMachine True 29 XGB XGBoost True In\u00a0[21]: Copied! <pre># Train the model\natom.run(models=\"RF\", metric=\"f1_weighted\")\n</pre> # Train the model atom.run(models=\"RF\", metric=\"f1_weighted\") <pre>\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1_weighted\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1_weighted: 1.0\nTest evaluation --&gt; f1_weighted: 0.9181\nTime elapsed: 02m:24s\n-------------------------------------------------\nTime: 02m:24s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 02m:24s\n-------------------------------------\nRandomForest --&gt; f1_weighted: 0.9181\n</pre> In\u00a0[22]: Copied! <pre>atom.evaluate()\n</pre> atom.evaluate() Out[22]: ba f1_weighted jaccard_weighted mcc precision_weighted recall_weighted RF 0.9183 0.9181 0.8486 0.8918 0.9206 0.9182 In\u00a0[23]: Copied! 
<pre>atom.plot_confusion_matrix(figsize=(700, 600))\n</pre> atom.plot_confusion_matrix(figsize=(700, 600)) In\u00a0[24]: Copied! <pre>atom.plot_shap_decision(rows=0, show=15)\n</pre> atom.plot_shap_decision(rows=0, show=15) In\u00a0[25]: Copied! <pre>atom.plot_shap_beeswarm(target=0, show=15)\n</pre> atom.plot_shap_beeswarm(target=0, show=15) <pre>100%|===================| 2827/2836 [02:38&lt;00:00]        </pre>"}, {"location": "examples/nlp/#example-nlp", "title": "Example: NLP\u00b6", "text": "<p>This example shows how to use ATOM to quickly go from raw text data to model predictions.</p> <p>Import the 20 newsgroups text dataset from sklearn.datasets. The dataset comprises around 18000 articles on 20 topics. The goal is to predict the topic of every article.</p>"}, {"location": "examples/nlp/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/nlp/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/nlp/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/pruning/", "title": "Pruning", "text": "In\u00a0[1]: Copied! <pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom optuna.pruners import HyperbandPruner\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from optuna.pruners import HyperbandPruner from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX, y = load_breast_cancer(return_X_y=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied! 
<pre># Initialize atom\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\n</pre> # Initialize atom atom = ATOMClassifier(X, y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n</pre> In\u00a0[4]: Copied! <pre># Use ht_params to specify a custom pruner\n# Note that pruned trials show the number of iterations it completed\natom.run(\n    models=\"SGD\",\n    metric=\"f1\",\n    n_trials=25,\n    ht_params={\n        \"distributions\": [\"penalty\", \"max_iter\"],\n        \"pruner\": HyperbandPruner(),\n    }\n)\n</pre> # Use ht_params to specify a custom pruner # Note that pruned trials show the number of iterations it completed atom.run(     models=\"SGD\",     metric=\"f1\",     n_trials=25,     ht_params={         \"distributions\": [\"penalty\", \"max_iter\"],         \"pruner\": HyperbandPruner(),     } ) <pre>\nTraining ========================= &gt;&gt;\nModels: SGD\nMetric: f1\n\n\nRunning hyperparameter tuning for StochasticGradientDescent...\n| trial | penalty | max_iter |      f1 | best_f1 | time_trial | time_ht |    state |\n| ----- | ------- | -------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |      l1 |      650 |  0.9558 |  0.9558 |     2.801s |  2.801s | COMPLETE |\n| 1     | elast.. |     1050 |  0.9744 |  0.9744 |     4.590s |  7.390s | COMPLETE |\n| 2     | elast.. 
|      500 |  0.9828 |  0.9828 |     0.033s |  7.423s |   PRUNED |\n| 3     |    None |      700 |  0.9739 |  0.9828 |     2.951s | 10.374s | COMPLETE |\n| 4     |      l1 |     1400 |  0.9735 |  0.9828 |     0.033s | 10.407s |   PRUNED |\n| 5     |    None |     1400 |  0.9735 |  0.9828 |     5.994s | 16.401s | COMPLETE |\n| 6     |      l2 |     1200 |  0.9825 |  0.9828 |     5.246s | 21.647s | COMPLETE |\n| 7     |      l2 |     1250 |  0.9825 |  0.9828 |     5.436s | 27.083s | COMPLETE |\n| 8     |    None |      600 |  0.9828 |  0.9828 |     0.023s | 27.106s |   PRUNED |\n| 9     |      l1 |      600 |  0.9402 |  0.9828 |     0.030s | 27.136s |   PRUNED |\n| 10    |      l2 |      950 |  0.9565 |  0.9828 |     4.118s | 31.254s | COMPLETE |\n| 11    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.259s | COMPLETE |\n| 12    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.264s | COMPLETE |\n| 13    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.269s | COMPLETE |\n| 14    |      l2 |     1500 |  0.9573 |  0.9828 |     0.038s | 31.306s |   PRUNED |\n| 15    |      l2 |      950 |  0.9565 |  0.9828 |     0.005s | 31.311s | COMPLETE |\n| 16    |      l2 |     1100 |  0.9391 |  0.9828 |     0.040s | 31.351s |   PRUNED |\n| 17    |      l2 |      850 |  0.9831 |  0.9831 |     0.030s | 31.381s |   PRUNED |\n| 18    | elast.. 
|     1300 |   0.931 |  0.9831 |     0.029s | 31.410s |   PRUNED |\n| 19    |      l2 |     1300 |  0.9649 |  0.9831 |     0.067s | 31.478s |   PRUNED |\n| 20    |      l2 |      800 |  0.9661 |  0.9831 |     0.039s | 31.517s |   PRUNED |\n| 21    |      l2 |     1150 |  0.9402 |  0.9831 |     0.032s | 31.548s |   PRUNED |\n| 22    |      l2 |     1300 |  0.9573 |  0.9831 |     0.038s | 31.586s |   PRUNED |\n| 23    |      l2 |     1250 |  0.9825 |  0.9831 |     0.008s | 31.594s | COMPLETE |\n| 24    |      l2 |     1050 |  0.9565 |  0.9831 |     0.070s | 31.665s |   PRUNED |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 6\nBest parameters:\n --&gt; penalty: l2\n --&gt; max_iter: 1200\nBest evaluation --&gt; f1: 0.9825\nTime elapsed: 31.665s\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.993\nTest evaluation --&gt; f1: 0.9722\nTime elapsed: 8.384s\n-------------------------------------------------\nTime: 40.049s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 40.301s\n-------------------------------------\nStochasticGradientDescent --&gt; f1: 0.9722\n</pre> In\u00a0[5]: Copied! <pre>atom.plot_trials()\n</pre> atom.plot_trials() In\u00a0[6]: Copied! <pre>atom.plot_hyperparameter_importance()\n</pre> atom.plot_hyperparameter_importance()"}, {"location": "examples/pruning/#example-pruning", "title": "Example: Pruning\u00b6", "text": "<p>This example shows an advanced example on how to use hyperparameter tuning with pruning.</p> <p>Import the breast cancer dataset from sklearn.datasets. 
This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/pruning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/pruning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/pruning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/ray_backend/", "title": "Ray backend", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport ray\nimport pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n</pre> # Import packages import ray import pandas as pd from atom import ATOMClassifier from sklearn.datasets import make_classification In\u00a0[2]: Copied! <pre># Use a small dataset for illustration purposes\nX, y = make_classification(n_samples=10000, n_features=10, random_state=1)\n</pre> # Use a small dataset for illustration purposes X, y = make_classification(n_samples=10000, n_features=10, random_state=1) In\u00a0[3]: Copied! <pre># Note we already specify the number of cores for parallel execution here\natom = ATOMClassifier(X, y, n_jobs=2, backend=\"ray\", verbose=2, random_state=1)\n</pre> # Note we already specify the number of cores for parallel execution here atom = ATOMClassifier(X, y, n_jobs=2, backend=\"ray\", verbose=2, random_state=1) <pre>2023-11-04 23:01:00,897\tINFO worker.py:1664 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265 \n</pre> <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nParallel processing with 2 cores.\nParallelization backend: ray\n\nDataset stats ==================== &gt;&gt;\nShape: (10000, 11)\nTrain set size: 8000\nTest set size: 2000\n-------------------------------------\nMemory: 880.13 kB\nScaled: True\nOutlier values: 211 (0.2%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># The ray backend uses modin instead of pandas as data handler\ntype(atom.dataset)\n</pre> # The ray backend uses modin instead of pandas as data handler type(atom.dataset) Out[4]: <pre>pandas.core.frame.DataFrame</pre> In\u00a0[5]: Copied! <pre># Use data cleaning as usual\natom.scale()\n</pre> # Use data cleaning as usual atom.scale() <pre>Fitting Scaler...\nScaling features...\n</pre> In\u00a0[6]: Copied! <pre># Using `parallel=True`, we train one model in each node\n# Note that when training in parallel, the verbosity of the models is zero\natom.run(models=[\"PA\", \"SGD\"], est_params={\"max_iter\": 150}, parallel=True)\n</pre> # Using `parallel=True`, we train one model in each node # Note that when training in parallel, the verbosity of the models is zero atom.run(models=[\"PA\", \"SGD\"], est_params={\"max_iter\": 150}, parallel=True) <pre>\nTraining ========================= &gt;&gt;\nModels: PA, SGD\nMetric: f1\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 9.407s\n-------------------------------------\nPassiveAggressive         --&gt; f1: 0.8165\nStochasticGradientDescent --&gt; f1: 0.8774 !\n</pre> In\u00a0[7]: Copied! <pre># Notice how the summed time to train the models is less than the total time\natom.plot_results(metric=\"time_fit\")\n</pre> # Notice how the summed time to train the models is less than the total time atom.plot_results(metric=\"time_fit\") In\u00a0[8]: Copied! <pre># Create a rest API endpoint and do inference on the holdout set\natom.pa.serve(port=8001)\n</pre> # Create a rest API endpoint and do inference on the holdout set atom.pa.serve(port=8001) In\u00a0[9]: Copied! <pre>import requests\n\nX_predict = atom.X_test.iloc[:10, :]\nresponse = requests.get(\"http://127.0.0.1:8001/\", json=X_predict.to_json())\n</pre> import requests  X_predict = atom.X_test.iloc[:10, :] response = requests.get(\"http://127.0.0.1:8001/\", json=X_predict.to_json()) In\u00a0[10]: Copied! 
<pre>response.json()\n</pre> response.json() Out[10]: <pre>[1, 1, 0, 0, 1, 1, 0, 1, 0, 0]</pre> In\u00a0[11]: Copied! <pre># Don't forget to shut down the ray server\nray.shutdown()\n</pre> # Don't forget to shut down the ray server ray.shutdown()"}, {"location": "examples/ray_backend/#example-ray-backend", "title": "Example: Ray backend\u00b6", "text": "<p>This example shows how to use the ray backend to train models in a parallel context.</p> <p>The data used is a synthetic dataset created using sklearn's make_classification function.</p>"}, {"location": "examples/ray_backend/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/ray_backend/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/ray_backend/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/regression/", "title": "Regression", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMRegressor\n</pre> # Import packages import pandas as pd from atom import ATOMRegressor In\u00a0[2]: Copied! <pre># Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load the data X = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")  # Let's have a look X.head() Out[2]: Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 In\u00a0[3]: Copied! 
<pre># Initialize atom for regression tasks\natom = ATOMRegressor(X, \"Rings\", verbose=2, random_state=42)\n</pre> # Initialize atom for regression tasks atom = ATOMRegressor(X, \"Rings\", verbose=2, random_state=42) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (4177, 9)\nTrain set size: 3342\nTest set size: 835\n-------------------------------------\nMemory: 300.88 kB\nScaled: False\nCategorical features: 1 (12.5%)\nOutlier values: 195 (0.6%)\n\n</pre> In\u00a0[4]: Copied! <pre># Encode the categorical features\natom.encode()\n</pre> # Encode the categorical features atom.encode() <pre>Fitting Encoder...\nEncoding categorical columns...\n --&gt; OneHot-encoding feature Sex. Contains 3 classes.\n</pre> In\u00a0[5]: Copied! <pre># Plot the dataset's correlation matrix\natom.plot_correlation()\n</pre> # Plot the dataset's correlation matrix atom.plot_correlation() In\u00a0[6]: Copied! <pre># Apply pca for dimensionality reduction\natom.feature_selection(strategy=\"pca\", n_features=6)\n</pre> # Apply pca for dimensionality reduction atom.feature_selection(strategy=\"pca\", n_features=6) <pre>Fitting FeatureSelector...\nPerforming feature selection ...\n --&gt; Applying Principal Component Analysis...\n   --&gt; Scaling features...\n   --&gt; Keeping 6 components.\n   --&gt; Explained variance ratio: 0.97\n</pre> In\u00a0[7]: Copied! <pre># Note that the fetaures are automatically renamed to pca0, pca1, etc...\natom.columns\n</pre> # Note that the fetaures are automatically renamed to pca0, pca1, etc... atom.columns Out[7]: <pre>Index(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'Rings'], dtype='object')</pre> In\u00a0[8]: Copied! 
<pre># Use the plotting methods to see the retained variance ratio\natom.plot_pca()\n</pre> # Use the plotting methods to see the retained variance ratio atom.plot_pca() In\u00a0[9]: Copied! <pre>atom.plot_components()\n</pre> atom.plot_components() In\u00a0[10]: Copied! <pre>atom.run(\n    models=[\"Tree\", \"Bag\", \"ET\"],\n    metric=\"mse\",\n    n_trials=5,\n    n_bootstrap=5,\n)\n</pre> atom.run(     models=[\"Tree\", \"Bag\", \"ET\"],     metric=\"mse\",     n_trials=5,     n_bootstrap=5, ) <pre>\nTraining ========================= &gt;&gt;\nModels: Tree, Bag, ET\nMetric: mse\n\n\nRunning hyperparameter tuning for DecisionTree...\n| trial |   criterion | splitter | max_depth | min_samples_split | min_samples_leaf | max_features | ccp_alpha |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ----------- | -------- | --------- | ----------------- | ---------------- | ------------ | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     | absolute_.. |     best |         5 |                 8 |               10 |         None |     0.035 | -6.5456 |  -6.5456 |     0.255s |  0.255s | COMPLETE |\n| 1     | squared_e.. |     best |        10 |                 5 |                1 |          0.5 |      0.03 | -7.1959 |  -6.5456 |     0.065s |  0.320s | COMPLETE |\n| 2     | absolute_.. |   random |        14 |                15 |               16 |         sqrt |     0.025 | -8.5859 |  -6.5456 |     0.067s |  0.387s | COMPLETE |\n| 3     | friedman_.. 
|   random |         4 |                10 |               17 |          0.9 |      0.01 | -7.4933 |  -6.5456 |     0.052s |  0.439s | COMPLETE |\n| 4     |     poisson |     best |        12 |                15 |                8 |          0.6 |      0.02 | -5.8126 |  -5.8126 |     0.066s |  0.505s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 4\nBest parameters:\n --&gt; criterion: poisson\n --&gt; splitter: best\n --&gt; max_depth: 12\n --&gt; min_samples_split: 15\n --&gt; min_samples_leaf: 8\n --&gt; max_features: 0.6\n --&gt; ccp_alpha: 0.02\nBest evaluation --&gt; mse: -5.8126\nTime elapsed: 0.505s\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -6.2977\nTest evaluation --&gt; mse: -7.1923\nTime elapsed: 0.045s\nBootstrap ---------------------------------------\nEvaluation --&gt; mse: -7.6026 \u00b1 0.3783\nTime elapsed: 0.110s\n-------------------------------------------------\nTime: 0.660s\n\n\nRunning hyperparameter tuning for Bagging...\n| trial | n_estimators | max_samples | max_features | bootstrap | bootstrap_features |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ------------ | ----------- | ------------ | --------- | ------------------ | ------- | -------- | ---------- | ------- | -------- |\n| 0     |          190 |         1.0 |          0.9 |      True |               True | -4.5751 |  -4.5751 |     5.791s |  5.791s | COMPLETE |\n\nException encountered while running the Bag model.\nMemoryError: could not allocate 187712 bytes\n\n\nRunning hyperparameter tuning for ExtraTrees...\n| trial | n_estimators |     criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ------------ | ------------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ------- | -------- | ---------- | 
------- | -------- |\n| 0     |          190 | squared_error |         8 |                13 |                3 |          0.5 |      True |         0.6 |     0.025 | -5.1462 |  -5.1462 |     0.285s |  0.285s | COMPLETE |\n| 1     |          230 | absolute_er.. |         8 |                 8 |                8 |         sqrt |      True |         0.6 |       0.0 | -9.3444 |  -5.1462 |     1.377s |  1.662s | COMPLETE |\n| 2     |          180 | absolute_er.. |         7 |                 2 |                3 |          0.6 |      True |         0.6 |      0.03 | -5.7371 |  -5.1462 |     1.738s |  3.400s | COMPLETE |\n| 3     |          100 | squared_error |        14 |                15 |                8 |         None |      True |         0.9 |     0.005 | -5.1938 |  -5.1462 |     0.231s |  3.631s | COMPLETE |\n| 4     |          340 | squared_error |         6 |                15 |                8 |         None |      True |         0.8 |      0.01 | -4.8716 |  -4.8716 |     0.457s |  4.088s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 4\nBest parameters:\n --&gt; n_estimators: 340\n --&gt; criterion: squared_error\n --&gt; max_depth: 6\n --&gt; min_samples_split: 15\n --&gt; min_samples_leaf: 8\n --&gt; max_features: None\n --&gt; bootstrap: True\n --&gt; max_samples: 0.8\n --&gt; ccp_alpha: 0.01\nBest evaluation --&gt; mse: -4.8716\nTime elapsed: 4.088s\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -5.4808\nTest evaluation --&gt; mse: -6.3445\nTime elapsed: 0.535s\nBootstrap ---------------------------------------\nEvaluation --&gt; mse: -6.3694 \u00b1 0.0737\nTime elapsed: 2.245s\n-------------------------------------------------\nTime: 6.868s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 32.361s\n-------------------------------------\nDecisionTree --&gt; mse: -7.6026 \u00b1 0.3783\nExtraTrees   --&gt; mse: -6.3694 \u00b1 0.0737 !\n</pre> In\u00a0[11]: Copied! 
<pre># Use the errors or residuals plots to check the model performances\natom.plot_residuals()\n</pre> # Use the errors or residuals plots to check the model performances atom.plot_residuals() In\u00a0[12]: Copied! <pre>atom.plot_errors()\n</pre> atom.plot_errors() In\u00a0[13]: Copied! <pre># Analyze the relation between the target response and the features\natom.plot_partial_dependence(columns=(0, 1, 2, 3))\n</pre> # Analyze the relation between the target response and the features atom.plot_partial_dependence(columns=(0, 1, 2, 3))"}, {"location": "examples/regression/#example-regression", "title": "Example: Regression\u00b6", "text": "<p>This example shows how to use ATOM to apply pca on the data and run a regression pipeline.</p> <p>Download the abalone dataset from https://archive.ics.uci.edu/ml/datasets/Abalone. The goal of this dataset is to predict the rings (age) of abalone shells from physical measurements.</p>"}, {"location": "examples/regression/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/regression/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/regression/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/successive_halving/", "title": "Successive halving", "text": "In\u00a0[1]: Copied! <pre>from sklearn.datasets import fetch_california_housing\nfrom atom import ATOMRegressor\n</pre> from sklearn.datasets import fetch_california_housing from atom import ATOMRegressor In\u00a0[2]: Copied! <pre># Load the data\nX, y = fetch_california_housing(return_X_y=True)\n</pre> # Load the data X, y = fetch_california_housing(return_X_y=True) In\u00a0[3]: Copied! 
<pre>atom = ATOMRegressor(X, y, verbose=2, random_state=1)\n</pre> atom = ATOMRegressor(X, y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (20640, 9)\nTrain set size: 16512\nTest set size: 4128\n-------------------------------------\nMemory: 1.49 MB\nScaled: False\nOutlier values: 786 (0.5%)\n\n</pre> In\u00a0[4]: Copied! <pre># Compare tree-based models via successive halving\natom.successive_halving(\n    models=[\"Tree\", \"Bag\", \"ET\", \"RF\", \"LGB\", \"CatB\"],\n    metric=\"mae\",\n    n_bootstrap=5,\n)\n</pre> # Compare tree-based models via successive halving atom.successive_halving(     models=[\"Tree\", \"Bag\", \"ET\", \"RF\", \"LGB\", \"CatB\"],     metric=\"mae\",     n_bootstrap=5, ) <pre>\nTraining ========================= &gt;&gt;\nMetric: mae\n\n\nRun: 0 =========================== &gt;&gt;\nModels: Tree6, Bag6, ET6, RF6, LGB6, CatB6\nSize of training set: 16512 (17%)\nSize of test set: 4128\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.0\nTest evaluation --&gt; mae: -0.5394\nTime elapsed: 0.103s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.576 \u00b1 0.0119\nTime elapsed: 0.422s\n-------------------------------------------------\nTime: 0.525s\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.1715\nTest evaluation --&gt; mae: -0.4308\nTime elapsed: 0.450s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.435 \u00b1 0.0059\nTime elapsed: 2.061s\n-------------------------------------------------\nTime: 2.511s\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.0\nTest evaluation --&gt; mae: -0.3977\nTime elapsed: 
1.574s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.4059 \u00b1 0.0028\nTime elapsed: 7.107s\n-------------------------------------------------\nTime: 8.681s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.1508\nTest evaluation --&gt; mae: -0.4053\nTime elapsed: 4.178s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.4162 \u00b1 0.0031\nTime elapsed: 18.156s\n-------------------------------------------------\nTime: 22.335s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.2031\nTest evaluation --&gt; mae: -0.3594\nTime elapsed: 0.438s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3673 \u00b1 0.0016\nTime elapsed: 0.886s\n-------------------------------------------------\nTime: 1.324s\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.1621\nTest evaluation --&gt; mae: -0.3483\nTime elapsed: 5.084s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3554 \u00b1 0.0025\nTime elapsed: 20.177s\n-------------------------------------------------\nTime: 25.261s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 01m:01s\n-------------------------------------\nDecisionTree --&gt; mae: -0.576 \u00b1 0.0119 ~\nBagging      --&gt; mae: -0.435 \u00b1 0.0059 ~\nExtraTrees   --&gt; mae: -0.4059 \u00b1 0.0028 ~\nRandomForest --&gt; mae: -0.4162 \u00b1 0.0031 ~\nLightGBM     --&gt; mae: -0.3673 \u00b1 0.0016 ~\nCatBoost     --&gt; mae: -0.3554 \u00b1 0.0025 ~ !\n\n\nRun: 1 =========================== &gt;&gt;\nModels: ET3, LGB3, CatB3\nSize of training set: 16512 (33%)\nSize of test set: 4128\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.0\nTest evaluation --&gt; mae: -0.3739\nTime elapsed: 
2.738s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3841 \u00b1 0.0027\nTime elapsed: 11.259s\n-------------------------------------------------\nTime: 13.997s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.2327\nTest evaluation --&gt; mae: -0.3356\nTime elapsed: 0.389s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.345 \u00b1 0.0037\nTime elapsed: 0.876s\n-------------------------------------------------\nTime: 1.265s\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.1882\nTest evaluation --&gt; mae: -0.3255\nTime elapsed: 4.800s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3352 \u00b1 0.0023\nTime elapsed: 22.708s\n-------------------------------------------------\nTime: 27.509s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 43.130s\n-------------------------------------\nExtraTrees --&gt; mae: -0.3841 \u00b1 0.0027 ~\nLightGBM   --&gt; mae: -0.345 \u00b1 0.0037 ~\nCatBoost   --&gt; mae: -0.3352 \u00b1 0.0023 ~ !\n\n\nRun: 2 =========================== &gt;&gt;\nModels: CatB1\nSize of training set: 16512 (100%)\nSize of test set: 4128\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.2229\nTest evaluation --&gt; mae: -0.2986\nTime elapsed: 6.851s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3091 \u00b1 0.0026\nTime elapsed: 33.428s\n-------------------------------------------------\nTime: 40.279s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 40.375s\n-------------------------------------\nCatBoost --&gt; mae: -0.3091 \u00b1 0.0026 ~\n</pre> In\u00a0[5]: Copied! <pre># The results is now multi-index, where frac is the fraction\n# of the training set used to fit the model. 
The model names\n# end with the number of models fitted during that run\natom.results\n</pre> # The results is now multi-index, where frac is the fraction # of the training set used to fit the model. The model names # end with the number of models fitted during that run atom.results Out[5]: mae_train mae_test time_fit mae_bootstrap time_bootstrap time frac model 0.17 Bag6 -0.2017 -0.4327 0.450035 -0.434981 2.061373 2.511408 CatB6 -0.2065 -0.3557 5.083625 -0.355352 20.176994 25.260619 ET6 -0.0694 -0.4077 1.574000 -0.405855 7.106890 8.680890 LGB6 -0.2202 -0.3676 0.438399 -0.367271 0.885806 1.324205 RF6 -0.1851 -0.4165 4.178345 -0.416217 18.156310 22.334655 Tree6 -0.1039 -0.5897 0.102987 -0.575962 0.422224 0.525211 0.33 CatB3 -0.2249 -0.3384 4.800246 -0.335246 22.708465 27.508711 ET3 -0.0935 -0.3879 2.738315 -0.384081 11.258794 13.997109 LGB3 -0.2489 -0.3405 0.389353 -0.344951 0.875797 1.265150 1.00 CatB1 -0.2447 -0.3066 6.851350 -0.309112 33.428059 40.279409 In\u00a0[6]: Copied! <pre># Plot the successive halving's results\natom.plot_successive_halving()\n</pre> # Plot the successive halving's results atom.plot_successive_halving() In\u00a0[7]: Copied! <pre># Use regex to call all the models with the same estimator...\natom.plot_errors(models=[\"CatB.*\"])\n</pre> # Use regex to call all the models with the same estimator... atom.plot_errors(models=[\"CatB.*\"]) In\u00a0[8]: Copied! <pre># ...or to call the models from the same run\natom.plot_errors(models=\".*3\")\n</pre> # ...or to call the models from the same run atom.plot_errors(models=\".*3\")"}, {"location": "examples/successive_halving/#example-successive-halving", "title": "Example: Successive halving\u00b6", "text": "<p>This example shows how to compare multiple tree-based models using successive halving.</p> <p>Import the california housing dataset from sklearn.datasets. 
This is a small and easy to train dataset whose goal is to predict house prices.</p>"}, {"location": "examples/successive_halving/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/successive_halving/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/successive_halving/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/train_sizing/", "title": "Train sizing", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load the data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! 
<pre># Initialize atom and prepare the data\natom = ATOMClassifier(X, verbose=2, random_state=1)\natom.clean()\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\", max_nan_rows=0.8)\natom.encode()\n</pre> # Initialize atom and prepare the data atom = ATOMClassifier(X, verbose=2, random_state=1) atom.clean() atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\", max_nan_rows=0.8) atom.encode() <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nFitting Cleaner...\nCleaning the data...\nFitting Imputer...\nImputing missing values...\n --&gt; Dropping 161 samples for containing more than 16 missing values.\n --&gt; Imputing 481 missing values with median (12.0) in feature MinTemp.\n --&gt; Imputing 265 missing values with median (22.6) in feature MaxTemp.\n --&gt; Imputing 1354 missing values with median (0.0) in feature Rainfall.\n --&gt; Imputing 60682 missing values with median (4.8) in feature Evaporation.\n --&gt; Imputing 67659 missing values with median (8.4) in feature Sunshine.\n --&gt; Imputing 9187 missing values with most_frequent (W) in feature WindGustDir.\n --&gt; Imputing 9127 missing values with median (39.0) in feature WindGustSpeed.\n --&gt; Imputing 9852 missing values with most_frequent (N) in feature WindDir9am.\n --&gt; Imputing 3617 missing values with most_frequent (SE) in feature WindDir3pm.\n --&gt; Imputing 1187 missing values with median (13.0) in feature WindSpeed9am.\n --&gt; Imputing 2469 missing values with median (19.0) in feature WindSpeed3pm.\n --&gt; Imputing 1613 missing values with median (70.0) in feature Humidity9am.\n --&gt; Imputing 
3449 missing values with median (52.0) in feature Humidity3pm.\n --&gt; Imputing 13863 missing values with median (1017.6) in feature Pressure9am.\n --&gt; Imputing 13830 missing values with median (1015.2) in feature Pressure3pm.\n --&gt; Imputing 53496 missing values with median (5.0) in feature Cloud9am.\n --&gt; Imputing 56933 missing values with median (5.0) in feature Cloud3pm.\n --&gt; Imputing 743 missing values with median (16.7) in feature Temp9am.\n --&gt; Imputing 2565 missing values with median (21.1) in feature Temp3pm.\n --&gt; Imputing 1354 missing values with most_frequent (No) in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 49 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[4]: Copied! 
<pre># Analyze the impact of the training set's size on a LR model\natom.train_sizing(\"LR\", train_sizes=10, n_bootstrap=5)\n</pre> # Analyze the impact of the training set's size on a LR model atom.train_sizing(\"LR\", train_sizes=10, n_bootstrap=5) <pre>\nTraining ========================= &gt;&gt;\nMetric: f1\n\n\nRun: 0 =========================== &gt;&gt;\nModels: LR01\nSize of training set: 11362 (10%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5624\nTest evaluation --&gt; f1: 0.5857\nTime elapsed: 0.721s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.585 \u00b1 0.0021\nTime elapsed: 0.729s\n-------------------------------------------------\nTime: 1.449s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.053s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.585 \u00b1 0.0021\n\n\nRun: 1 =========================== &gt;&gt;\nModels: LR02\nSize of training set: 22724 (20%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.582\nTest evaluation --&gt; f1: 0.5874\nTime elapsed: 0.853s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5851 \u00b1 0.002\nTime elapsed: 0.865s\n-------------------------------------------------\nTime: 1.718s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.425s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5851 \u00b1 0.002\n\n\nRun: 2 =========================== &gt;&gt;\nModels: LR03\nSize of training set: 34087 (30%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5812\nTest evaluation --&gt; f1: 0.585\nTime elapsed: 1.086s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5861 \u00b1 
0.0009\nTime elapsed: 1.119s\n-------------------------------------------------\nTime: 2.205s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.035s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5861 \u00b1 0.0009\n\n\nRun: 3 =========================== &gt;&gt;\nModels: LR04\nSize of training set: 45449 (40%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5828\nTest evaluation --&gt; f1: 0.5862\nTime elapsed: 1.173s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5863 \u00b1 0.0017\nTime elapsed: 1.282s\n-------------------------------------------------\nTime: 2.455s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.365s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5863 \u00b1 0.0017\n\n\nRun: 4 =========================== &gt;&gt;\nModels: LR05\nSize of training set: 56812 (50%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5823\nTest evaluation --&gt; f1: 0.5853\nTime elapsed: 1.264s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.585 \u00b1 0.0016\nTime elapsed: 1.460s\n-------------------------------------------------\nTime: 2.724s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.758s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.585 \u00b1 0.0016\n\n\nRun: 5 =========================== &gt;&gt;\nModels: LR06\nSize of training set: 68174 (60%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5835\nTest evaluation --&gt; f1: 0.5843\nTime elapsed: 1.392s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.585 \u00b1 0.0016\nTime elapsed: 
1.704s\n-------------------------------------------------\nTime: 3.095s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 4.151s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.585 \u00b1 0.0016\n\n\nRun: 6 =========================== &gt;&gt;\nModels: LR07\nSize of training set: 79536 (70%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5872\nTest evaluation --&gt; f1: 0.5846\nTime elapsed: 1.585s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5852 \u00b1 0.0013\nTime elapsed: 1.836s\n-------------------------------------------------\nTime: 3.421s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 4.664s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5852 \u00b1 0.0013\n\n\nRun: 7 =========================== &gt;&gt;\nModels: LR08\nSize of training set: 90899 (80%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5889\nTest evaluation --&gt; f1: 0.5841\nTime elapsed: 1.693s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5852 \u00b1 0.0025\nTime elapsed: 2.139s\n-------------------------------------------------\nTime: 3.832s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.157s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5852 \u00b1 0.0025\n\n\nRun: 8 =========================== &gt;&gt;\nModels: LR09\nSize of training set: 102261 (90%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5871\nTest evaluation --&gt; f1: 0.5837\nTime elapsed: 1.754s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5844 \u00b1 0.0022\nTime elapsed: 
2.353s\n-------------------------------------------------\nTime: 4.107s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.464s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5844 \u00b1 0.0022\n\n\nRun: 9 =========================== &gt;&gt;\nModels: LR10\nSize of training set: 113624 (100%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5856\nTest evaluation --&gt; f1: 0.585\nTime elapsed: 1.978s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5846 \u00b1 0.0005\nTime elapsed: 2.544s\n-------------------------------------------------\nTime: 4.521s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.975s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5846 \u00b1 0.0005\n</pre> In\u00a0[5]: Copied! <pre># The results are now multi-index, where frac is the fraction\n# of the training set used to fit the model. The model names\n# end with the fraction as well (without the dot)\natom.results\n</pre> # The results are now multi-index, where frac is the fraction # of the training set used to fit the model. The model names # end with the fraction as well (without the dot) atom.results Out[5]: f1_train f1_test time_fit f1_bootstrap time_bootstrap time frac model 0.1 LR01 0.5622 0.5852 0.720655 0.585044 0.728664 1.449319 0.2 LR02 0.5830 0.5845 0.852776 0.585144 0.864794 1.717570 0.3 LR03 0.5795 0.5856 1.085709 0.586101 1.119410 2.205119 0.4 LR04 0.5847 0.5858 1.173066 0.586305 1.282166 2.455232 0.5 LR05 0.5836 0.5862 1.264150 0.585003 1.460329 2.724479 0.6 LR06 0.5832 0.5833 1.391943 0.584966 1.703550 3.095493 0.7 LR07 0.5880 0.5856 1.585444 0.585199 1.835532 3.420976 0.8 LR08 0.5914 0.5882 1.693054 0.585235 2.138652 3.831706 0.9 LR09 0.5854 0.5828 1.753595 0.584420 2.353141 4.106736 1.0 LR10 0.5862 0.5850 1.977799 0.584634 2.543574 4.521373 In\u00a0[6]: Copied! 
<pre># Every model can be accessed through its name\natom.lr05.plot_shap_waterfall(show=6)\n</pre> # Every model can be accessed through its name atom.lr05.plot_shap_waterfall(show=6) In\u00a0[7]: Copied! <pre># Plot the train sizing's results\natom.plot_learning_curve()\n</pre> # Plot the train sizing's results atom.plot_learning_curve()"}, {"location": "examples/train_sizing/#example-train-sizing", "title": "Example: Train sizing\u00b6", "text": "<p>This example shows how to asses a model's performance based on the size of the training set.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/train_sizing/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/train_sizing/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/train_sizing/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/utilities/", "title": "Utilities", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport tempfile\nimport pandas as pd\nfrom sklearn.metrics import fbeta_score\nfrom atom import ATOMClassifier\n</pre> # Import packages import tempfile import pandas as pd from sklearn.metrics import fbeta_score from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... 
Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre>atom = ATOMClassifier(X, random_state=1)\natom.clean()\n\n# Quickly check what columns have missing values\nprint(f\"Columns with missing values:\\n{atom.nans}\")\n\n# Or what columns are categorical\nprint(f\"\\nCategorical columns: {atom.categorical}\")\n\n# Or if the dataset is scaled\nprint(f\"\\nIs the dataset scaled? {atom.scaled}\")\n</pre> atom = ATOMClassifier(X, random_state=1) atom.clean()  # Quickly check what columns have missing values print(f\"Columns with missing values:\\n{atom.nans}\")  # Or what columns are categorical print(f\"\\nCategorical columns: {atom.categorical}\")  # Or if the dataset is scaled print(f\"\\nIs the dataset scaled? 
{atom.scaled}\") <pre>Columns with missing values:\nLocation             0\nMinTemp            637\nMaxTemp            322\nRainfall          1406\nEvaporation      60843\nSunshine         67816\nWindGustDir       9330\nWindGustSpeed     9270\nWindDir9am       10013\nWindDir3pm        3778\nWindSpeed9am      1348\nWindSpeed3pm      2630\nHumidity9am       1774\nHumidity3pm       3610\nPressure9am      14014\nPressure3pm      13981\nCloud9am         53657\nCloud3pm         57094\nTemp9am            904\nTemp3pm           2726\nRainToday         1406\ndtype: int64\n\nCategorical columns: Index(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'], dtype='object')\n\nIs the dataset scaled? False\n</pre> In\u00a0[4]: Copied! <pre># Note the number of missing values and categorical columns\natom.stats()\n</pre> # Note the number of missing values and categorical columns atom.stats() <pre>Dataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 27.44 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n</pre> In\u00a0[5]: Copied! <pre># Now, let's impute and encode the dataset...\natom.impute()\natom.encode()\n\n# ... and the values are gone\natom.stats()\n</pre> # Now, let's impute and encode the dataset... atom.impute() atom.encode()  # ... and the values are gone atom.stats() <pre>Dataset stats ==================== &gt;&gt;\nShape: (56420, 22)\nTrain set size: 45075\nTest set size: 11345\n-------------------------------------\nMemory: 11.11 MB\nScaled: False\nOutlier values: 3203 (0.3%)\n</pre> In\u00a0[6]: Copied! <pre># Compare the relationship of multiple columns with a scatter maxtrix\natom.plot_relationships(columns=slice(0, 5))\n</pre> # Compare the relationship of multiple columns with a scatter maxtrix atom.plot_relationships(columns=slice(0, 5)) In\u00a0[7]: Copied! 
<pre># Check which distribution fits a column best\natom.distribution(columns=\"Rainfall\")\n</pre> # Check which distribution fits a column best atom.distribution(columns=\"Rainfall\") Out[7]: Rainfall dist stat beta score 0.6506 p_value 0.0 expon score 0.6506 p_value 0.0 gamma score 0.6465 p_value 0.0 invgauss score 0.6257 p_value 0.0 lognorm score 0.6485 p_value 0.0 norm score 0.3807 p_value 0.0 pearson3 score 0.6506 p_value 0.0 triang score 0.7191 p_value 0.0 uniform score 0.8914 p_value 0.0 weibull_min score 0.6506 p_value 0.0 weibull_max score 0.8896 p_value 0.0 In\u00a0[8]: Copied! <pre># Investigate a column's distribution\natom.plot_distribution(columns=\"MinTemp\", distributions=\"beta\")\natom.plot_qq(columns=\"MinTemp\", distributions=\"beta\")\n</pre> # Investigate a column's distribution atom.plot_distribution(columns=\"MinTemp\", distributions=\"beta\") atom.plot_qq(columns=\"MinTemp\", distributions=\"beta\") <p>There are two ways to quickly transform the dataset mid-pipeline. The first way is through the property's <code>@setter</code>. The downside for this approach is that the transformation is not stored in atom's pipeline, so the transformation is not applied on new data. Therefore, we recommend using the second approach, through the add method.</p> In\u00a0[9]: Copied! <pre># Note that we can only replace a dataframe with a new dataframe!\natom.X = atom.X.assign(AvgTemp=(atom.X[\"MaxTemp\"] + atom.X[\"MinTemp\"])/2)\n\n# This will automatically update all other data attributes\nassert \"AvgTemp\" in atom\n\n# But it's not saved to atom's pipeline\natom.pipeline\n</pre> # Note that we can only replace a dataframe with a new dataframe! 
atom.X = atom.X.assign(AvgTemp=(atom.X[\"MaxTemp\"] + atom.X[\"MinTemp\"])/2)  # This will automatically update all other data attributes assert \"AvgTemp\" in atom  # But it's not saved to atom's pipeline atom.pipeline Out[9]: <pre>Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare'))])</pre>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.Pipeline<pre>Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare'))])</pre>Cleaner<pre>Cleaner()</pre>Imputer<pre>Imputer()</pre>Encoder<pre>Encoder(value='rare')</pre> In\u00a0[10]: Copied! <pre># Same transformation, different approach (AvgTemp is overwritten)\ndef transform(df):\n    df[\"AvgTemp\"] = (df.MaxTemp + df.MinTemp) / 2\n    return df\n\natom.apply(transform)\n\nassert \"AvgTemp\" in atom\n</pre> # Same transformation, different approach (AvgTemp is overwritten) def transform(df):     df[\"AvgTemp\"] = (df.MaxTemp + df.MinTemp) / 2     return df  atom.apply(transform)  assert \"AvgTemp\" in atom In\u00a0[11]: Copied! <pre># Now the function appears in the pipeline\natom.pipeline\n</pre> # Now the function appears in the pipeline atom.pipeline Out[11]: <pre>Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare')),\n                ('FunctionTransformer',\n                 FunctionTransformer(func=&lt;function transform at 0x0000016745DF6B90&gt;))])</pre>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. 
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.Pipeline<pre>Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare')),\n                ('FunctionTransformer',\n                 FunctionTransformer(func=&lt;function transform at 0x0000016745DF6B90&gt;))])</pre>Cleaner<pre>Cleaner()</pre>Imputer<pre>Imputer()</pre>Encoder<pre>Encoder(value='rare')</pre>FunctionTransformer<pre>FunctionTransformer(func=&lt;function transform at 0x0000016745DF6B90&gt;)</pre> In\u00a0[12]: Copied! <pre>atom.available_models()\n</pre> atom.available_models() Out[12]: acronym model estimator module needs_scaling accepts_sparse native_multilabel native_multioutput has_validation supports_engines 0 AdaB AdaBoost AdaBoostClassifier sklearnensemble False True False False False sklearn 1 Bag Bagging BaggingClassifier sklearnensemble False True False False False sklearn 2 BNB BernoulliNB BernoulliNB sklearnnaive_bayes False True False False False sklearn, cuml 3 CatB CatBoost CatBoostClassifier catboostcatboost True True False False True catboost 4 CatNB CategoricalNB CategoricalNB sklearnnaive_bayes False True False False False sklearn, cuml 5 CNB ComplementNB ComplementNB sklearnnaive_bayes False True False False False sklearn, cuml 6 Tree DecisionTree DecisionTreeClassifier sklearntree False True True True False sklearn 7 Dummy Dummy DummyClassifier sklearndummy False False False False False sklearn 8 ETree ExtraTree ExtraTreeClassifier sklearntree False True True True False sklearn 9 ET ExtraTrees ExtraTreesClassifier sklearnensemble False True True True False sklearn 10 GNB GaussianNB GaussianNB sklearnnaive_bayes False False False False False sklearn, cuml 11 GP GaussianProcess GaussianProcessClassifier sklearngaussian_process False False False False False sklearn 12 GBM GradientBoostingMachine GradientBoostingClassifier sklearnensemble 
False True False False False sklearn 13 hGBM HistGradientBoosting HistGradientBoostingClassifier sklearnensemble False False False False False sklearn 14 KNN KNearestNeighbors KNeighborsClassifier sklearnneighbors True True True True False sklearn, sklearnex, cuml 15 LGB LightGBM LGBMClassifier lightgbmlightgbm.sklearn True True False False True lightgbm 16 LDA LinearDiscriminantAnalysis LinearDiscriminantAnalysis sklearndiscriminant_analysis False False False False False sklearn 17 lSVM LinearSVM LinearSVC sklearnsvm True True False False False sklearn, cuml 18 LR LogisticRegression LogisticRegression sklearnlinear_model True True False False False sklearn, sklearnex, cuml 19 MLP MultiLayerPerceptron MLPClassifier sklearnneural_network True True True False True sklearn 20 MNB MultinomialNB MultinomialNB sklearnnaive_bayes False True False False False sklearn, cuml 21 PA PassiveAggressive PassiveAggressiveClassifier sklearnlinear_model True True False False True sklearn 22 Perc Perceptron Perceptron sklearnlinear_model True False False False True sklearn 23 QDA QuadraticDiscriminantAnalysis QuadraticDiscriminantAnalysis sklearndiscriminant_analysis False False False False False sklearn 24 RNN RadiusNearestNeighbors RadiusNeighborsClassifier sklearnneighbors True True True True False sklearn 25 RF RandomForest RandomForestClassifier sklearnensemble False True True True False sklearn, sklearnex, cuml 26 Ridge Ridge RidgeClassifier sklearnlinear_model True True True False False sklearn, sklearnex, cuml 27 SGD StochasticGradientDescent SGDClassifier sklearnlinear_model True True False False True sklearn 28 SVM SupportVectorMachine SVC sklearnsvm True True False False False sklearn, sklearnex, cuml 29 XGB XGBoost XGBClassifier xgboostxgboost True True False False True xgboost In\u00a0[13]: Copied! 
<pre>atom.verbose = 1\n\n# Define a custom metric\ndef f2(y_true, y_pred):\n    return fbeta_score(y_true, y_pred, beta=2)\n\n# Use the greater_is_better, needs_proba and needs_threshold parameters if necessary\natom.run(models=\"LR\", metric=f2)\n</pre> atom.verbose = 1  # Define a custom metric def f2(y_true, y_pred):     return fbeta_score(y_true, y_pred, beta=2)  # Use the greater_is_better, needs_proba and needs_threshold parameters if necessary atom.run(models=\"LR\", metric=f2) <pre>\nTraining ========================= &gt;&gt;\nModels: LR\nMetric: f2\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f2: 0.5693\nTest evaluation --&gt; f2: 0.5709\nTime elapsed: 0.863s\n-------------------------------------------------\nTime: 0.863s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.491s\n-------------------------------------\nLogisticRegression --&gt; f2: 0.5709\n</pre> In\u00a0[14]: Copied! <pre># You can use the est_params parameter to customize the estimator\n# Let's run AdaBoost using LR instead of a decision tree as base estimator\natom.run(\"AdaB\", est_params={\"base_estimator\": atom.lr.estimator})\n</pre> # You can use the est_params parameter to customize the estimator # Let's run AdaBoost using LR instead of a decision tree as base estimator atom.run(\"AdaB\", est_params={\"base_estimator\": atom.lr.estimator}) <pre>\nTraining ========================= &gt;&gt;\nModels: AdaB\nMetric: f2\n\n\nResults for AdaBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f2: 0.556\nTest evaluation --&gt; f2: 0.5636\nTime elapsed: 2.568s\n-------------------------------------------------\nTime: 2.568s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.065s\n-------------------------------------\nAdaBoost --&gt; f2: 0.5636\n</pre> In\u00a0[15]: Copied! 
<pre>atom.adab.estimator\n</pre> atom.adab.estimator Out[15]: <pre>AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=1, random_state=1),\n                   random_state=1)</pre>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.AdaBoostClassifier<pre>AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=1, random_state=1),\n                   random_state=1)</pre>base_estimator: LogisticRegression<pre>LogisticRegression(n_jobs=1, random_state=1)</pre>LogisticRegression<pre>LogisticRegression(n_jobs=1, random_state=1)</pre> In\u00a0[16]: Copied! <pre># Note that parameters specified by est_params are not optimized in the BO\natom.run(\n    models=\"Tree\",\n    n_trials=10,\n    est_params={\n        \"criterion\": \"gini\",\n        \"splitter\": \"best\",\n        \"min_samples_leaf\": 1,\n        \"ccp_alpha\": 0.035,\n    },\n    verbose=2,\n)\n</pre> # Note that parameters specified by est_params are not optimized in the BO atom.run(     models=\"Tree\",     n_trials=10,     est_params={         \"criterion\": \"gini\",         \"splitter\": \"best\",         \"min_samples_leaf\": 1,         \"ccp_alpha\": 0.035,     },     verbose=2, ) <pre>\nTraining ========================= &gt;&gt;\nModels: Tree\nMetric: f2\n\n\nRunning hyperparameter tuning for DecisionTree...\n| trial | max_depth | min_samples_split | max_features |      f2 | best_f2 | time_trial | time_ht |    state |\n| ----- | --------- | ----------------- | ------------ | ------- | ------- | ---------- | ------- | -------- |\n| 0     |        13 |                12 |          0.5 |  0.4362 |  0.4362 |     3.161s |  3.161s | COMPLETE |\n| 1     |        14 |                16 |         log2 |  0.4729 |  0.4729 |     2.872s |  6.033s | COMPLETE |\n| 2     |        16 |                13 |          0.8 |  0.4626 |  0.4729 |     
3.201s |  9.234s | COMPLETE |\n| 3     |         9 |                 6 |         None |  0.4903 |  0.4903 |     3.075s | 12.309s | COMPLETE |\n| 4     |         5 |                 2 |         log2 |  0.4889 |  0.4903 |     2.812s | 15.121s | COMPLETE |\n| 5     |         1 |                15 |          0.5 |  0.4953 |  0.4953 |     2.827s | 17.948s | COMPLETE |\n| 6     |        15 |                 9 |         sqrt |  0.5004 |  0.5004 |     2.951s | 20.899s | COMPLETE |\n| 7     |        13 |                20 |         None |  0.5004 |  0.5004 |     3.242s | 24.141s | COMPLETE |\n| 8     |         3 |                19 |          0.5 |  0.4936 |  0.5004 |     2.800s | 26.941s | COMPLETE |\n| 9     |        15 |                20 |         sqrt |  0.4762 |  0.5004 |     3.170s | 30.111s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 6\nBest parameters:\n --&gt; max_depth: 15\n --&gt; min_samples_split: 9\n --&gt; max_features: sqrt\nBest evaluation --&gt; f2: 0.5004\nTime elapsed: 30.111s\nFit ---------------------------------------------\nTrain evaluation --&gt; f2: 0.4925\nTest evaluation --&gt; f2: 0.4925\nTime elapsed: 0.452s\n-------------------------------------------------\nTime: 30.563s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 30.885s\n-------------------------------------\nDecisionTree --&gt; f2: 0.4925\n</pre> <p>Note that both instances need to be initialized with the same data and use the same metric for model training to be able to merge.</p> In\u00a0[17]: Copied! <pre>tempdir = tempfile.gettempdir()\n</pre> tempdir = tempfile.gettempdir() In\u00a0[18]: Copied! 
<pre># Save the atom instance as a pickle\n# Use save_data=False to save the instance without the data\natom.save(tempdir + \"atom\", save_data=False)\n</pre> # Save the atom instance as a pickle # Use save_data=False to save the instance without the data atom.save(tempdir + \"atom\", save_data=False) <pre>ATOMClassifier successfully saved.\n</pre> In\u00a0[20]: Copied! <pre># No need to store the transformed data, providing the original dataset to\n# the loader automatically transforms it through all the steps in the pipeline\natom_2 = ATOMClassifier.load(tempdir + \"atom\", data=(X,))\n</pre> # No need to store the transformed data, providing the original dataset to # the loader automatically transforms it through all the steps in the pipeline atom_2 = ATOMClassifier.load(tempdir + \"atom\", data=(X,)) <pre>ATOMClassifier successfully loaded.\n</pre> In\u00a0[21]: Copied! <pre># Create a separate instance with its own branch and model\natom_3 = ATOMClassifier(X, verbose=0, random_state=1)\natom_3.branch.name = \"lightgbm\"\natom_3.impute()\natom_3.encode()\natom_3.run(\"LGB\", metric=f2)\n</pre> # Create a separate instance with its own branch and model atom_3 = ATOMClassifier(X, verbose=0, random_state=1) atom_3.branch.name = \"lightgbm\" atom_3.impute() atom_3.encode() atom_3.run(\"LGB\", metric=f2) In\u00a0[22]: Copied! <pre># Merge the instances\natom_2.merge(atom_3)\n</pre> # Merge the instances atom_2.merge(atom_3) <pre>Merging instances...\n --&gt; Merging branch lightgbm.\n --&gt; Merging model LGB.\n --&gt; Merging attributes.\n</pre> In\u00a0[23]: Copied! <pre># Note that it now contains both branches and all models\natom_2\n</pre> # Note that it now contains both branches and all models atom_2 Out[23]: <pre>ATOMClassifier\n --&gt; Branches:\n   --&gt; main !\n   --&gt; lightgbm\n --&gt; Models: LR, AdaB, Tree, LGB\n --&gt; Metric: f2</pre> In\u00a0[24]: Copied! 
<pre>atom_2.results\n</pre> atom_2.results Out[24]: f2_train f2_test time_fit time frac model 0.8 AdaB 0.5599 0.5590 2.568021 2.568021 LR 0.5723 0.5685 0.863496 0.863496 Tree 0.4930 0.4928 0.452411 30.563017 1.0 LGB 0.6578 0.5909 3.991159 3.991159"}, {"location": "examples/utilities/#example-utilities", "title": "Example: Utilities\u00b6", "text": "<p>This example shows various useful utilities that can be used to improve atom's pipelines.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/utilities/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/utilities/#use-the-utility-attributes", "title": "Use the utility attributes\u00b6", "text": ""}, {"location": "examples/utilities/#use-the-stats-method-to-assess-changes-in-the-dataset", "title": "Use the stats method to assess changes in the dataset\u00b6", "text": ""}, {"location": "examples/utilities/#inspect-feature-distributions", "title": "Inspect feature distributions\u00b6", "text": ""}, {"location": "examples/utilities/#change-the-data-mid-pipeline", "title": "Change the data mid-pipeline\u00b6", "text": ""}, {"location": "examples/utilities/#get-an-overview-of-the-available-models", "title": "Get an overview of the available models\u00b6", "text": ""}, {"location": "examples/utilities/#use-a-custom-metric", "title": "Use a custom metric\u00b6", "text": ""}, {"location": "examples/utilities/#customize-the-estimators-parameters", "title": "Customize the estimator's parameters\u00b6", "text": ""}, {"location": "examples/utilities/#save-load", "title": "Save &amp; load\u00b6", "text": ""}, {"location": "user_guide/accelerating/", "title": "Accelerating pipelines", "text": "<p>For very large datasets, ATOM offers various ways to accelerate its 
pipeline:</p> <ul> <li>Run estimators on GPU</li> <li>Use a faster data engine</li> <li>Use a faster estimator engine</li> <li>Run processes in parallel</li> </ul> <p>Warning</p> <p>Performance improvements are usually noticeable for datasets larger  than ~5M rows. For smaller datasets, using other values than the default can even harm performance!</p>"}, {"location": "user_guide/accelerating/#gpu-acceleration", "title": "GPU acceleration", "text": "<p>Graphics Processing Units (GPUs) can significantly accelerate calculations for preprocessing steps or training machine learning models. Training models involves compute-intensive matrix multiplications and other operations that can take advantage of a GPU's massively parallel architecture. Training on large datasets can take hours to run on a single processor. However, if you offload those tasks to a GPU, you can reduce training time to minutes instead.</p> <p>Running transformers and models in atom using a GPU is as easy as initializing the instance with parameter <code>device=\"gpu\"</code>. The <code>device</code> parameter accepts any string that follows the SYCL_DEVICE_FILTER filter selector. Examples are:</p> <ul> <li>device=\"cpu\" (use CPU)</li> <li>device=\"gpu\" (use default GPU)</li> <li>device=\"gpu:0\" (use first GPU)</li> <li>device=\"gpu:1\" (use second GPU)</li> </ul> <p>Combine GPU acceleration with the cuml and sklearnex estimator engines. The XGBoost, LightGBM and CatBoost models come with their own GPU engine. Setting <code>device=\"gpu\"</code> is sufficient to accelerate them with GPU, regardless of the engine parameter.</p> <p>Warning</p> <p>ATOM does not support multi-GPU training. If there is more than one GPU on the machine and the <code>device</code> parameter does not specify which one to use, the first one is used by default.</p> <p>Example</p> <p> Train a model on a GPU yourself using SageMaker Studio Lab. Just click on the badge above and run the notebook! 
Make sure to choose the GPU compute type.</p>"}, {"location": "user_guide/accelerating/#data-acceleration", "title": "Data acceleration", "text": "<p>The data engine can be specified through the <code>engine</code> parameter, which takes a dict with a key <code>data</code> that accepts three values: numpy, pyarrow and modin.</p>"}, {"location": "user_guide/accelerating/#numpy", "title": "numpy", "text": "<p>ATOM uses <code>pandas</code> as the default library for data handling, which in turn, uses <code>numpy</code> for all data processing.</p>"}, {"location": "user_guide/accelerating/#pyarrow", "title": "pyarrow", "text": "<p>PyArrow is a library that provides a way to work with Apache Arrow memory structures. Apache Arrow is a cross-language, platform-independent, in-memory data format that provides an efficient and fast way to serialize and deserialize data. Pandas offers native integration with pyarrow, which atom uses when specifying the pyarrow data engine.</p> <p>Warning</p> <ul> <li>The pyarrow backend doesn't work for sparse datasets. If the   dataset has any sparse columns, an exception is raised.</li> <li>The LightGBM and XGBoost models don't support pyarrow   dtypes.</li> </ul>"}, {"location": "user_guide/accelerating/#modin", "title": "modin", "text": "<p>The modin library is a multi-threading, drop-in replacement for pandas, that uses Ray as backend.</p>"}, {"location": "user_guide/accelerating/#estimator-acceleration", "title": "Estimator acceleration", "text": "<p>The estimator engine can be specified through the <code>engine</code> parameter, which takes a dict with a key <code>estimator</code> that accepts three values: sklearn, sklearnex and cuml. 
Read here how to run the estimators on GPU instead of CPU.</p> <p>Warning</p> <p>Estimators accelerated with sklearnex or cuML sometimes use slightly different hyperparameters than their sklearn counterparts.</p>"}, {"location": "user_guide/accelerating/#sklearn", "title": "sklearn", "text": "<p>This is the default option, which uses the standard estimators from sklearn. Sklearn does not support training on GPU.</p>"}, {"location": "user_guide/accelerating/#sklearnex", "title": "sklearnex", "text": "<p>The Intel\u00ae Extension for Scikit-learn package (or sklearnex, for brevity) accelerates sklearn models and transformers, keeping full conformance with sklearn's API. Sklearnex is a free software AI accelerator that offers a way to make sklearn code 10\u2013100 times faster. The software acceleration is achieved through the use of vector instructions, IA hardware-specific memory optimizations, threading, and optimizations for all upcoming Intel platforms at launch time. See here an example using the sklearnex engine.</p> <p>Warning</p> <p>sklearnex estimators don't support sparse datasets nor multioutput tasks.</p> <p>Tip</p> <p>Intel\u00ae processors provide better performance than other CPUs.</p>"}, {"location": "user_guide/accelerating/#prerequisites", "title": "Prerequisites", "text": "<ul> <li>Operating System:<ul> <li>Linux (Ubuntu, Fedora, etc...)</li> <li>Windows 8.1+</li> <li>macOS (no GPU support)</li> </ul> </li> <li>CPU:<ul> <li>Processor must have x86 architecture.</li> <li>Processor must support at least one of SSE2, AVX, AVX2, AVX512 instruction sets.</li> <li>ARM* architecture is not supported.</li> </ul> </li> <li>GPU:<ul> <li>All Intel\u00ae integrated and discrete GPUs.</li> <li>Intel\u00ae GPU drivers.</li> </ul> </li> <li>Libraries:<ul> <li>sklearnex&gt;=2023.2.1 (automatically installed with atom when the processor has x86 architecture)</li> <li>dpcpp_cpp_rt&gt;=2023.2  (only for GPU acceleration)</li> </ul> </li> </ul>"}, {"location": 
"user_guide/accelerating/#supported-estimators", "title": "Supported estimators", "text": "<ul> <li>Pruner (only for strategy=\"dbscan\")</li> <li> <p>FeatureSelector (only for strategy=\"pca\" and dense datasets)</p> </li> <li> <p>ElasticNet (only for CPU acceleration)</p> </li> <li>KNearestNeighbors</li> <li>Lasso (only for CPU acceleration)</li> <li>LogisticRegression</li> <li>OrdinaryLeastSquares</li> <li>RandomForest</li> <li>Ridge (only for regression tasks and CPU acceleration)</li> <li>SupportVectorMachine (GPU acceleration only supports classification tasks)</li> </ul>"}, {"location": "user_guide/accelerating/#cuml", "title": "cuML", "text": "<p>cuML is the machine learning library of the RAPIDS project. cuML enables you to run traditional tabular ML tasks on GPUs without going into the details of CUDA programming. For large datasets, these GPU-based implementations can complete 10-50x faster than their CPU equivalents.</p> <p>Warning</p> <ul> <li>cuML estimators don't support multioutput tasks nor the pyarrow   data engine.</li> <li>Install cuML using <code>pip install --extra-index-url=https://pypi.nvidia.com   cuml-cu11</code> or <code>pip install --extra-index-url=https://pypi.nvidia.com   cuml-cu12</code> depending on your CUDA version. Read more about RAPIDS'   installation here.</li> </ul> <p>Tip</p> <p>Only transformers and predictors are converted to the requested engine. 
To use a metric from cuML, insert it directly in the <code>run</code> method:</p> <pre><code>from atom import ATOMClassifier\nfrom cuml.metrics import accuracy_score\nfrom sklearn.datasets import make_classification\n\nX, y = make_classification(n_samples=100, random_state=1)\n\natom = ATOMClassifier(X, y, engine={\"estimator\": \"cuml\"}, verbose=2)\natom.run(\"LR\", metric=accuracy_score)\n</code></pre>"}, {"location": "user_guide/accelerating/#prerequisites_1", "title": "Prerequisites", "text": "<ul> <li>Operating System:<ul> <li>Ubuntu 18.04/20.04 or CentOS 7/8 with gcc/++ 9.0+</li> <li>Windows 10+ with WSL2 (see here a tutorial)</li> </ul> </li> <li>GPU:<ul> <li>NVIDIA Pascal\u2122 or better with compute capability 6.0+</li> </ul> </li> <li>Drivers:<ul> <li>CUDA &amp; NVIDIA Drivers of versions 11.0, 11.2, 11.4 or 11.5</li> </ul> </li> <li>Libraries:<ul> <li>cuML&gt;=23.08</li> </ul> </li> </ul>"}, {"location": "user_guide/accelerating/#supported-estimators_1", "title": "Supported estimators", "text": "<ul> <li>Cleaner</li> <li>Discretizer</li> <li>Imputer (only for strat_num!=\"knn\")</li> <li>Normalizer</li> <li>Pruner (only for strategy=\"dbscan\" and \"hdbscan\")</li> <li>Scaler</li> <li>Vectorizer</li> <li> <p>FeatureSelector (only for strategy=\"pca\")</p> </li> <li> <p>BernoulliNB</p> </li> <li>CategoricalNB</li> <li>ElasticNet</li> <li>GaussianNB</li> <li>KNearestNeighbors</li> <li>Lasso</li> <li>LinearSVM</li> <li>LogisticRegression</li> <li>MultinomialNB</li> <li>OrdinaryLeastSquares</li> <li>RandomForest</li> <li>Ridge (only for regression tasks)</li> <li>SupportVectorMachine</li> </ul>"}, {"location": "user_guide/accelerating/#parallel-execution", "title": "Parallel execution", "text": "<p>Another way to accelerate your pipelines is executing processes in parallel. 
Use the <code>backend</code> parameter to select one of several parallelization backends.</p> <ul> <li>loky: Used by default, can induce some communication and memory overhead   when exchanging input and output data with the worker Python processes. On   some rare systems (such as Pyodide), the loky backend may not be available.</li> <li>multiprocessing: Previous process-based backend based on <code>multiprocessing.Pool</code>.   Less robust than loky.</li> <li>threading: Very low-overhead backend but it suffers from the Python Global   Interpreter Lock if the called function relies a lot on Python objects. It's    mostly useful when the execution bottleneck is a compiled extension that   explicitly releases the GIL (for instance a Cython loop wrapped in a \"with nogil\"   block or an expensive call to a library such as numpy).</li> <li>ray: Ray is an open-source unified compute framework   that makes it easy to scale AI and Python workloads. Read more about Ray here.   See here an example use case.</li> </ul> <p>The parallelization backend is applied in the following cases:</p> <ul> <li>In every individual estimator that uses parallelization internally.</li> <li>To calculate cross-validated results during hyperparameter tuning.</li> <li>To train multiple models in parallel (when the trainer's <code>parallel</code> parameter is True).</li> <li>To calculate partial dependencies in plot_partial_dependence.</li> </ul> <p>Note</p> <p>The <code>n_jobs</code> parameter sets the number of cores for the individual models as well as for parallel training. You won't gain much training two models in parallel with 2 cores, when the models also parallelize computations internally. 
Instead, use parallel training for models that can't parallelize their training (their constructor doesn't have the <code>n_jobs</code> parameter).</p>"}, {"location": "user_guide/data_cleaning/", "title": "Data cleaning", "text": "<p>More often than not, you'll need to do some data cleaning before fitting your dataset to a model.  Usually, this involves importing different libraries and writing many lines of code. Since ATOM is all about fast exploration  and experimentation, it provides various data cleaning classes to apply the most common transformations fast and easy.</p> <p>Note</p> <ul> <li>All of atom's data cleaning methods automatically adopt the relevant   transformer attributes (<code>n_jobs</code>, <code>verbose</code>, <code>logger</code>, <code>random_state</code>)   from atom. A different choice can be added as parameter to the method   call, e.g., <code>atom.scale(verbose=2)</code>.</li> <li>Like the add method, the data cleaning methods   accept the <code>columns</code> parameter to only transform a subset of the   dataset's features, e.g., <code>atom.scale(columns=[0, 1])</code>. Read   more in the row and column selection section.</li> </ul> <p></p>"}, {"location": "user_guide/data_cleaning/#balancing-the-data", "title": "Balancing the data", "text": "<p>One of the common issues found in datasets that are used for classification is imbalanced classes. Data imbalance usually reflects an unequal distribution of classes within a dataset. For example, in a credit card fraud detection dataset, most of the transactions are non-fraud, and a very few cases are fraud. This leaves us with a very unbalanced ratio of fraud vs non-fraud cases. The Balancer class can oversample the minority class or undersample the majority class using any of the transformers implemented in the imblearn package. 
It can be  accessed from atom through the balance method.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#standard-data-cleaning", "title": "Standard data cleaning", "text": "<p>There are many data cleaning steps that are useful to perform on any dataset before modeling. These are general rules that apply almost on every use-case and every task. The Cleaner class is a convenient tool to apply such steps. It can be accessed from atom through the clean method. Use the class' parameters to choose which transformations to perform. The available steps are:</p> <ul> <li>Drop columns with specific data types.</li> <li>Strip categorical features from white spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column.</li> </ul> <p></p>"}, {"location": "user_guide/data_cleaning/#binning-numerical-features", "title": "Binning numerical features", "text": "<p>Discretization (otherwise known as quantization or binning) provides a way to partition continuous features into discrete values. Certain datasets with continuous features may benefit from discretization, because discretization can transform the dataset of continuous attributes to one with only nominal attributes. Discretization is similar to constructing histograms for continuous data. However, histograms focus on counting features which fall into particular bins, whereas discretization focuses on assigning feature values to these bins. The Discretizer class can be used to bin continuous data into intervals. It can be accessed from atom through the discretize method.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#encoding-categorical-features", "title": "Encoding categorical features", "text": "<p>Many datasets contain categorical features. Their variables are typically stored as text values which represent various classes. 
Some examples include color (\u201cRed\u201d, \u201cYellow\u201d, \u201cBlue\u201d), size (\u201cSmall\u201d, \u201cMedium\u201d, \u201cLarge\u201d) or geographic designations (city or country). Regardless of what the value is used for, the challenge is determining how to use this data in the analysis. The majority of sklearn's models don't support direct manipulation of this kind of data. Use the Encoder class to encode categorical features to numerical values. It can be  accessed from atom through the encode method.</p> <p>There are many strategies to encode categorical columns. The Encoder class applies one strategy or another depending on the number of classes in the column to be encoded. When there are only two, the values are encoded with 0 or 1. When there are more than two, the columns can be encoded using one-hot encoding or any other strategy of the category-encoders package, depending on the value of the <code>max_onehot</code> parameter. One-hot encodes the column making a dummy feature for every class. This approach preserves all the information but increases the size of the dataset considerably, making it often an undesirable strategy for high cardinality features. Other strategies like Target transform the column in place.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#imputing-missing-values", "title": "Imputing missing values", "text": "<p>For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with ATOM's models which assume that all values in an array are numerical, and that all have and hold meaning. The Imputer class handles missing values in the dataset by either dropping or imputing the value. 
It can be accessed from atom through the impute method.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#normalizing-the-feature-set", "title": "Normalizing the feature set", "text": "<p>Use the Normalizer class to transform the feature set to follow a Normal (Gaussian)-like distribution. In general, data must be transformed when using models that assume normality in the residuals. Examples of such models are LogisticRegression, LinearDiscriminantAnalysis and GaussianNB. The class can be accessed from atom through the normalize method.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#handling-outliers", "title": "Handling outliers", "text": "<p>When modeling, it is important to clean the data sample to ensure that the observations best represent the problem. Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data. These are called outliers. Often, machine learning modeling and model skill in general can be improved by  understanding and even removing these outlier samples. The Pruner class offers 7 different strategies to detect outliers (described hereunder). It can be accessed from atom through the prune method.</p> <p>z-score The z-score of a value in the dataset is defined as the number of standard deviations by which the value is above or below the mean of the column. Values above or below a certain threshold (specified with the parameter <code>max_sigma</code>) are considered outliers. Note that, contrary to the rest of the strategies, this approach selects outlier values, not outlier samples! Because of this, it is possible to replace the outlier value instead of dropping the entire sample.</p> <p>Isolation Forest Uses a tree-based anomaly detection algorithm. It is based on modeling the normal data in such a way as to isolate anomalies that are both few and different in the feature space. 
Read more in sklearn's documentation.</p> <p>Elliptic Envelope If the input variables have a Gaussian distribution, then simple statistical methods can be used to detect outliers. For example, if the dataset has two input variables and both are Gaussian, the feature space forms a multidimensional Gaussian, and knowledge of this distribution can be used to identify values far from the distribution. This approach can be generalized by defining a hypersphere (ellipsoid) that covers the normal data, and data that falls outside this shape is considered an outlier. Read more in sklearn's documentation.</p> <p>Local Outlier Factor A simple approach to identifying outliers is to locate those examples that are far from the other examples in the feature space. This can work well for feature spaces with low dimensionality (few features) but becomes less reliable as the number of features is increased. The local outlier factor is a technique that attempts to harness the idea of nearest neighbors for outlier detection. Each example is assigned a score of how isolated or how likely it is to be outliers based on the size of its local neighborhood. Those examples with the largest score are more likely to be outliers. Read more in sklearn's documentation.</p> <p>One-class SVM The support vector machine algorithm, initially developed for binary classification tasks, can also be used for one-class classification. When modeling one class, the algorithm captures the density of the majority class and classifies examples on the extremes of the density function as outliers. This modification of SVM is referred to as One-Class SVM. Read more in sklearn's documentation.</p> <p>DBSCAN The DBSCAN algorithm views clusters as areas of high density separated by areas of low density. Due to this rather generic view, clusters found by DBSCAN can be any shape, as opposed to k-means which assumes that clusters are convex shaped. Samples that lie outside any cluster are considered outliers. 
Read more in sklearn's documentation.</p> <p>OPTICS The OPTICS algorithm shares many similarities with the DBSCAN algorithm, and can be considered a generalization of DBSCAN that relaxes the <code>eps</code> requirement from a single value to a value range. The key difference between DBSCAN and OPTICS is that the OPTICS algorithm builds a reachability graph, which assigns each sample both a reachability distance and a spot within the cluster ordering. These two attributes are assigned when the model is fitted, and are used to determine cluster membership. Read more in sklearn's documentation.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#scaling-the-feature-set", "title": "Scaling the feature set", "text": "<p>Standardization of a dataset is a common requirement for many machine learning estimators; they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with zero mean and unit variance). The Scaler class lets you quickly scale atom's dataset using one of sklearn's scalers. It can be accessed from atom through the scale method. </p> <p>Info</p> <p>All strategies can utilize GPU speed-up. Click here for further information about GPU acceleration.</p>"}, {"location": "user_guide/data_management/", "title": "Data management", "text": ""}, {"location": "user_guide/data_management/#data-sets", "title": "Data sets", "text": "<p>ATOM is designed to work around one single dataset: the one with which atom is initialized. This is the dataset you want to explore, transform, and use for model training and validation. ATOM differentiates three different data sets:</p> <ul> <li>The training set is usually the largest of the data sets. As the   name suggests, this set is used to train the pipeline. During   hyperparameter tuning, only the training set is used to fit and   evaluate the estimator in every call. The training set in the current   branch can be accessed through the <code>train</code> attribute. 
Its   features and target can be accessed through <code>X_train</code> and <code>y_train</code>   respectively.</li> <li>The test set is used to evaluate the models. The model scores on   this set give an indication of how the model performs on new data. The   test set can be accessed through the <code>test</code> attribute. Its features   and target can be accessed through <code>X_test</code> and <code>y_test</code> respectively.</li> <li>The holdout set is an optional, separate set that should only be   used to evaluate the final model's performance. Create this set when   you are going to use the test set for an intermediate validation step.   The holdout set is immediately set apart during initialization and is   not considered part of atom's dataset (the <code>dataset</code> attribute only   returns the train and test sets). The holdout set is left untouched   until predictions are made on it, i.e., it does not undergo any pipeline   transformations until the data set is requested for the first time.   The holdout set is stored in atom's <code>holdout</code> attribute. See   here an example that shows how to use the holdout   data set.</li> </ul> <p>The data can be provided in different formats. If the data sets are not specified beforehand, you can input the features and target separately or together:</p> <ul> <li>X</li> <li>X, y</li> </ul> <p>Remember to use the <code>y</code> parameter to indicate the target column in X when using the first option. If not specified, the last column in X is used as target. In both these cases, the sizes of the sets are defined using the <code>test_size</code> and <code>holdout_size</code> parameters. 
Note that the splits are made after the subsample of the dataset with the <code>n_rows</code> parameter (when not left to its default value).</p> <p>If you already have the separate data sets, provide them using one of the following formats:</p> <ul> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>The input data is always converted internally to a dataframe, if it isn't one already. The column names should always be strings. If they are not, atom changes their type at initialization. If no column names are provided, default names are given of the form <code>X[N-1]</code>, where N stands for the n-th feature in the dataset.</p> <p></p>"}, {"location": "user_guide/data_management/#indexing", "title": "Indexing", "text": "<p>By default, atom resets the dataframe's index after initialization and after every transformation in the pipeline. To avoid this, specify the <code>index</code> parameter. If the dataset has an 'identifier' column, it is useful to use it as index for two reasons:</p> <ul> <li>An identifier doesn't usually contain any useful information   on the target column, and should therefore be removed before training.</li> <li>Predictions of specific rows can be accessed through their index.</li> </ul> <p>Warning</p> <p>Avoid duplicate indices in the dataframe. Having them raises an error when initializing atom and may potentially lead to unexpected behavior if introduced later.</p> <p></p>"}, {"location": "user_guide/data_management/#sparse-datasets", "title": "Sparse datasets", "text": "<p>If atom is initialized using a scipy sparse matrix, it is converted internally to a dataframe of sparse columns. Read more about pandas' sparse data structures here. 
The same conversion takes place when a transformer returns a sparse matrix, like for example, the Vectorizer.</p> <p>Note that ATOM considers a dataset to be sparse if any of the columns is sparse. A dataset can only benefit from sparsity when all its columns are sparse, hence mixing sparse and non-sparse columns is not recommended and can cause estimators to decrease their training speed or even crash. Use the shrink method to convert dense features to sparse and the available_models method to check which models have native support for sparse matrices.</p> <p>Click here to see an example that uses sparse data.</p> <p>Warning</p> <p>Estimators accelerated using sklearnex don't support sparse  datasets.</p> <p></p>"}, {"location": "user_guide/data_management/#multioutput-tasks", "title": "Multioutput tasks", "text": "<p>Multioutput is a task where there are more than one target column, i.e., the goal is to predict multiple targets at the same time. When providing a dataframe as target, use the y parameter. Providing <code>y</code> without keyword makes ATOM think you are providing <code>train, test</code> (see the data sets section).</p>"}, {"location": "user_guide/data_management/#task-types", "title": "Task types", "text": "<p>ATOM recognizes four multioutput tasks.</p> <p>Note</p> <p>Combinations of binary and multiclass target columns are treated as multiclass-multioutput tasks.</p>"}, {"location": "user_guide/data_management/#multilabel", "title": "Multilabel", "text": "<p>Multilabel is a classification task, labeling each sample with <code>m</code> labels from <code>n_classes</code> possible classes, where <code>m</code> can be 0 to <code>n_classes</code> inclusive. This can be thought of as predicting properties of a sample that are not mutually exclusive.</p> <p>For example, prediction of the topics relevant to a text document. The document may be about one of religion, politics, finance or education, several of the topic classes or all of the topic classes. 
The target column (<code>atom.y</code>) could look like this:</p> <pre><code>0                        [politics]\n1               [religion, finance]\n2    [politics, finance, education]\n3                                []\n4                         [finance]\n5               [finance, religion]\n6                         [finance]\n7               [religion, finance]\n8                       [education]\n9     [finance, religion, politics]\n\nName: target, dtype: object\n</code></pre> <p>A model can not directly ingest a variable amount of target classes. Use the clean method to assign a binary output to each class, for every sample. Positive classes are indicated with 1 and negative classes with 0. It is thus comparable to running n_classes binary classification tasks. In our example, the target (<code>atom.y</code>) is converted to:</p> <pre><code>   education  finance  politics  religion\n0          0        0         1         0\n1          0        1         0         1\n2          1        1         1         0\n3          0        0         0         0\n4          0        1         0         0\n5          0        1         0         1\n6          0        1         0         0\n7          0        1         0         1\n8          1        0         0         0\n9          0        1         1         1\n</code></pre>"}, {"location": "user_guide/data_management/#multiclass-multioutput", "title": "Multiclass-multioutput", "text": "<p>Multiclass-multioutput (also known as multitask classification) is a classification task which labels each sample with a set of non-binary properties. Both the number of properties and the number of classes per property is greater than 2. A single estimator thus handles several joint classification tasks. 
This is both a generalization of the multilabel classification task, which only considers binary attributes, as well as a generalization of the multiclass classification task, where only one property is considered.</p> <p>For example, classification of the properties \"type of fruit\" and \"colour\" for a set of images of fruit. The property \"type of fruit\" has the possible classes: \"apple\", \"pear\" and \"orange\". The property \"colour\" has the possible classes: \"green\", \"red\", \"yellow\" and \"orange\". Each sample is an image of a fruit, a label is output for both properties and each label is one of the possible classes of the corresponding property.</p>"}, {"location": "user_guide/data_management/#multioutput-regression", "title": "Multioutput regression", "text": "<p>Multioutput regression predicts multiple numerical properties for each sample. Each property is a numerical variable and the number of properties to be predicted for each sample is &gt;= 2. Some estimators that support multioutput regression are faster than just running n_output estimators.</p> <p>For example, prediction of both wind speed and wind direction, in degrees, using data obtained at a certain location. Each sample would be data obtained at one location and both wind speed and direction would be output for each sample.</p>"}, {"location": "user_guide/data_management/#multivariate", "title": "Multivariate", "text": "<p>Multivariate is the multioutput task for forecasting. 
In this case, we try to forecast more than one time series at the same time.</p> <p>Although all forecasting models in ATOM support multivariate tasks, we differentiate two types of models:</p> <ul> <li>The \"native multivariate\" models apply forecasts where every prediction   of endogenous (<code>y</code>) variables will depend on values of the other target   columns.</li> <li>The rest of the models apply an estimator per column, meaning that forecasts   will be made per endogenous variable, and not be affected by other variables.   To access the column-wise estimators, use the estimator's <code>forecasters_</code>   attribute, which stores the fitted forecasters in a dataframe.</li> </ul> <p>Read more about time series tasks here.</p>"}, {"location": "user_guide/data_management/#native-multioutput-models", "title": "Native multioutput models", "text": "<p>Some models have native support for multioutput tasks. This means that the original estimator is used to make predictions directly on all the target columns. Examples of such models are KNearestNeighbors, RandomForest and ExtraTrees.</p>"}, {"location": "user_guide/data_management/#non-native-multioutput-models", "title": "Non-native multioutput models", "text": "<p>The majority of the models don't have integrated support for multioutput tasks. However, it's possible to still use them for such tasks, wrapping them in a meta-estimator capable of handling multiple target columns. For non-native multioutput models, ATOM does so automatically. For multilabel tasks, the meta-estimator is:</p> <ul> <li>ClassifierChain</li> </ul> <p>And for multiclass-multioutput and multioutput regression, the meta-estimators are respectively:</p> <ul> <li>MultioutputClassifier</li> <li>MultioutputRegressor</li> </ul> <p>Warning</p> <p>Currently, scikit-learn metrics do not support multiclass-multioutput classification tasks. 
In this case, ATOM calculates the mean of the selected metric over every individual target.</p> <p>Tip</p> <ul> <li>Set the <code>native_multilabel</code> or <code>native_multioutput</code> parameter in ATOMModel equal to <code>True</code> to ignore the meta-estimator for custom models.</li> <li>Check out the multilabel classification and multioutput regression examples.</li> </ul> <p></p>"}, {"location": "user_guide/data_management/#branches", "title": "Branches", "text": "<p>You might want to compare how a model performs on a dataset transformed through multiple pipelines, each using different transformers. For example, on one pipeline with an undersampling strategy and the other with an oversampling strategy. To be able to do this, ATOM has a branching system.</p> <p>The branching system helps the user to manage multiple data pipelines within the same atom instance. Branches are created and accessed through atom's <code>branch</code> property. A branch contains a specific pipeline, the dataset transformed through that pipeline, and all data and utility attributes that refer to that dataset. Transformers and models called from atom use the dataset in the current branch, as well as data attributes such as <code>atom.dataset</code>. It's not allowed to change the data in a branch after fitting a model with it. Instead, create a new branch for every unique pipeline.</p> <p>By default, atom starts with one branch called \"main\". To start a new branch, set a new name to the property, e.g., <code>atom.branch = \"undersample\"</code>. This creates a new branch from the current one. To create a branch from any other branch type \"_from_\" between the new name and the branch from which to split, e.g., <code>atom.branch = \"oversample_from_main\"</code> creates branch \"oversample\" from branch \"main\", even if the current branch is \"undersample\". 
To switch between existing branches, just type the name of the desired branch, e.g., <code>atom.branch = \"main\"</code> brings you back to the main branch. Note that every branch contains a unique copy of the whole dataset! Creating many branches can cause memory issues for large datasets.</p> <p>See the Imbalanced datasets or Feature engineering examples for branching use cases.</p> <p>Warning</p> <p>Always create a new branch if you want to change the dataset after fitting a model! Forcing a data change through the data property's <code>@setter</code> can cause unexpected model behavior and break down the plotting methods.</p> <p></p> <p> </p> Figure 1. Diagram of a possible branch system to compare an oversampling with an undersampling pipeline. <p></p>"}, {"location": "user_guide/data_management/#memory-considerations", "title": "Memory considerations", "text": "<p>An atom instance stores one copy of the dataset for each branch (this doesn't include the holdout set, which is only stored once), and one copy of the initial dataset with which the instance is initialized. This copy of the original dataset is necessary to avoid data leakage during hyperparameter tuning and for some specific methods like cross_validate and reset. It's created as soon as there are no branches in the initial state (usually after calling the first data transformation). If the dataset is occupying too much memory, consider using the shrink method to convert the dtypes to their smallest possible matching dtype.</p> <p>When working with large datasets and multiple branches, it becomes impossible to store all branches in memory at the same time. To avoid out-of-memory errors, use atom's <code>memory</code> parameter. 
If not <code>False</code>, atom saves the data of inactive branches as well as the original branch at the specified location (in a directory called <code>joblib</code>, the name of the underlying library managing the caching), maintaining only the current active branch in memory. This mechanism results in a slight drop in performance because of the I/O overhead, but can save a lot of memory. Additionally, the memory's location is also used to cache the output of the <code>fit</code> method of transformers in the pipeline. See here an example using the memory parameter.</p> <p>Apart from the dataset itself, a model's metric scores and shap values are also stored as attributes of the model to avoid having to recalculate them every time they are needed. You can delete all these attributes using the clear method in order to free some memory before saving atom.</p> <p></p>"}, {"location": "user_guide/data_management/#data-transformations", "title": "Data transformations", "text": "<p>Performing data transformations is a common requirement of many datasets before they are ready to be ingested by a model. ATOM provides various classes to apply data cleaning and feature engineering transformations to the data. This tooling should be able to help you apply most of the typically needed transformations to get the data ready for modeling. For further fine-tuning, it's also possible to transform the data using custom transformers (see the add method) or through a function (see the apply method). Remember that all transformations are only applied to the dataset in the current branch.</p>"}, {"location": "user_guide/data_management/#row-and-column-selection", "title": "Row and column selection", "text": "<p>Many methods in atom contain the <code>rows</code> or <code>columns</code> parameter to select a subset of the dataset. Examples are the evaluate and save_data methods for <code>rows</code>, and the distribution and shrink methods for <code>columns</code>. 
All data cleaning and feature engineering methods use the <code>columns</code> parameter to apply the transformation only to that selection of columns, and all prediction methods use the <code>rows</code> parameter to make predictions on that selection of rows.</p> <p>As you can see, these two parameters are very important and shared across many methods in atom. Rows and columns can be selected in multiple ways. The check is performed in the order described hereunder:</p> <ol> <li>By actual dataset, e.g., <code>rows=atom.test</code> is equal to <code>rows=\"test\"</code>.</li> <li>By range or slice, e.g., <code>rows=range(100)</code> to select the first 100    rows from the dataset or <code>rows=slice(20, 100)</code> to select rows 20 to 99.</li> <li>By exact name, e.g., <code>rows=[\"row1\", \"row2\"]</code> to select rows with    indices <code>row1</code> and <code>row2</code> or <code>columns=[\"col1\", \"col2\"]</code> to select    columns <code>col1</code> and <code>col2</code>. It's also possible to use the <code>+</code> sign to select    multiple rows or columns, e.g., <code>columns=\"col1+col2\"</code> is the same    as <code>columns=[\"col1\", \"col2\"]</code>.</li> <li>By position, e.g., <code>rows=[0, 1, 2]</code> to select the first three rows.</li> <li>By name of the data set (only for rows), e.g., <code>rows=\"train\"</code> to    select all rows in the training set, or <code>rows=\"test+holdout\"</code> to    select all rows in the test and holdout sets. Valid data sets are <code>dataset</code>,    <code>train</code>, <code>test</code> and <code>holdout</code>.</li> <li>By dtype (only for columns), e.g., <code>columns=\"number\"</code> to select only     numerical columns. See pandas' user guide.</li> <li>By regex match, e.g., <code>columns=\"mean_.*\"</code> to select all columns    starting with <code>mean_</code>.</li> <li>Excluding instead of including using the <code>!</code> sign, e.g. 
<code>columns=\"!col1\"</code>    to select all columns except <code>col1</code>. You can also exclude multiple rows or    columns like this <code>columns=[\"!col1\", \"!col2\"]</code> or this    <code>columns=\"!col1+!col2\"</code>. It's also possible to exclude data sets    for row selection, e.g., <code>rows=\"!train\"</code> or dtypes for column    selection, e.g., <code>columns=\"!number\"</code>. Note that if a column name    starts with <code>!</code>, the selection of that name will take priority over exclusion.    Rows and columns can only be included or excluded, and not both at the same    time. For example, this selection raises an exception: <code>columns=[\"col1\", \"!col2\"]</code>.</li> </ol> <p>Info</p> <p>In some plotting methods, it's possible to plot separate lines for different subsets of the rows. For example, to compare the results on the train and test set. For these cases, either provide a sequence to the <code>rows</code> parameter for every line you want to draw, e.g., <code>atom.plot_roc(rows=(\"train\", \"test\"))</code>, or provide a dictionary where the keys are the names of the sets (used in the legend) and the values are the corresponding selection of rows, selected using any of the aforementioned approaches, e.g., <code>atom.plot_roc(rows={\"0-99\": range(100), \"100-199\": range(100, 200)})</code>. Note that for these methods, using <code>atom.plot_roc(rows=\"train+test\")</code> only plots one line with the data from both sets. See the advanced plotting example.</p>"}, {"location": "user_guide/feature_engineering/", "title": "Feature engineering", "text": "<p>Feature engineering is the process of creating new features from the existing ones, in order to capture relationships with the target column that the first set of features didn't have on their own. This process is very important to improve the performance of machine learning algorithms. 
Although feature engineering works best when the data  scientist applies use-case specific transformations, there are ways to do this in an automated manner, without prior domain knowledge. One of the problems of creating new features without human expert intervention, is that many of the newly created features can be useless, i.e., they do not help the algorithm to make better predictions. Even worse, having useless features can drop your performance. To avoid this, we perform feature selection, a process in which we select the relevant features  in the dataset. See the Feature engineering example.</p> <p>Note</p> <ul> <li>All of atom's feature engineering methods automatically adopt the relevant   transformer attributes (<code>n_jobs</code>, <code>verbose</code>, <code>logger</code>, <code>random_state</code>) from   atom. A different choice can be added as parameter to the method call,   e.g., <code>atom.feature_selection(\"pca\", n_features=10, random_state=2)</code>.</li> <li>Like the add method, the feature engineering   methods accept the <code>columns</code> parameter to only transform a subset of the   dataset's features, e.g., <code>atom.feature_selection(\"pca\",n_features=10, columns=slice(5, 15))</code>. Read more in the   row and column selection section.</li> </ul> <p></p>"}, {"location": "user_guide/feature_engineering/#extracting-datetime-features", "title": "Extracting datetime features", "text": "<p>Features that contain dates or timestamps can not be directly ingested by models since they are not strictly numerical. Encoding them as categorical features is not an option since the encoding does not capture the relationship between the different moments in time. The FeatureExtractor class creates new features extracting datetime elements (e.g., day, month, year, hour...) from the columns. It can be accessed from atom through the feature_extraction method. 
The new features are named equally to the column from which they are extracted, followed by an underscore and the datetime element they create, e.g., <code>x0_day</code> for the day element of <code>x0</code>.</p> <p>Note that many time features have a cyclic pattern, e.g., after Sunday comes Monday. This means that if we would encode the days of the week from 0 to 6, we would lose that relation. A common method used to encode cyclical features is to transform the data into two dimensions using a sine and cosine transformation:</p> \\[ x_{sin} = sin\\left(\\frac{2\\pi * x}{max(x)}\\right) \\] \\[ x_{cos} = cos\\left(\\frac{2\\pi * x}{max(x)}\\right) \\] <p>The resulting features have their names followed by sin or cos, e.g. <code>x0_day_sin</code> and <code>x0_day_cos</code>. The datetime elements that can be encoded in a cyclic fashion are: microsecond, second, minute, hour, weekday, day, day_of_year, month and quarter. Note that decision trees based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features since the sin/cos values are expected to be considered as one single coordinate system.</p> <p>Use the <code>fmt</code> parameter to specify your feature's format in case the column is categorical. The FeatureExtractor class will convert the column to the datetime dtype before extracting the specified features. Click here for an overview of the available formats.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#generating-new-features", "title": "Generating new features", "text": "<p>The FeatureGenerator class creates new non-linear features based on the original feature set. It can be accessed from atom through the feature_generation method. You can choose between two strategies: Deep Feature Synthesis and Genetic Feature Generation.</p> <p>Deep Feature Synthesis Deep feature synthesis (DFS) applies the selected operators on the features in the dataset. 
For example, if the operator is \"log\", it will create the new feature <code>LOG(old_feature)</code> and if the operator is \"mul\", it will create the new feature <code>old_feature_1 x old_feature_2</code>. The operators can be chosen through the <code>operators</code> parameter. Choose from:</p> <ul> <li>add: Take the sum of two features.</li> <li>sub: Subtract two features from each other.</li> <li>mul: Multiply two features with each other.</li> <li>div: Divide two features with each other.</li> <li>abs: Calculate the absolute value of a feature.</li> <li>sqrt: Calculate the square root of a feature.</li> <li>log: Calculate the natural logarithm of a feature.</li> <li>sin: Calculate the sine of a feature.</li> <li>cos: Calculate the cosine of a feature.</li> <li>tan: Calculate the tangent of a feature.</li> </ul> <p>ATOM's implementation of DFS uses the featuretools package.</p> <p></p> <p>Genetic Feature Generation Genetic feature generation (GFG) uses genetic programming, a branch of evolutionary programming, to determine which features are successful and create new ones based on those. Where dfs can be seen as some kind of \"brute force\" for feature engineering, gfg tries to improve its features with every generation of the algorithm. gfg uses the same operators as dfs, but instead of only applying the transformations once, it evolves them further, creating nested structures of combinations of features. The new features are given the name <code>feature_n</code>, where n stands for the n-th feature in the dataset. You can access the genetic features' fitness and description (how they are calculated) through the <code>genetic_features</code> attribute.</p> <p>ATOM uses the SymbolicTransformer class from the gplearn package for the genetic algorithm. 
Read more about this implementation here.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#grouping-similar-features", "title": "Grouping similar features", "text": "<p>When your dataset contains many similar features corresponding to a certain natural group or entity, it's possible to replace these features for a handful of them, that should capture the relations of the group, in order to lose as little information as possible. To achieve this, the FeatureGrouper class computes certain statistical properties that describe the group's distribution, like the mean or the median, and replaces the columns with the result of these statistical calculations over every row in the dataset. The goal of this approach is to reduce the number of columns in the dataset, avoiding the curse of dimensionality.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#selecting-useful-features", "title": "Selecting useful features", "text": "<p>The FeatureSelector class provides tooling to select the relevant features from a dataset. It can be accessed from atom through the feature_selection method.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#standard-strategies", "title": "Standard strategies", "text": "<p> Univariate Univariate feature selection works by selecting the best features based on univariate statistical F-test. The test is provided via the <code>solver</code> parameter. It takes any function taking two arrays (X, y), and returning arrays (scores, p-values). Read more in sklearn's documentation.</p> <p></p> <p> Principal Components Analysis Applying PCA reduces the dimensionality of the dataset by maximizing the variance of each dimension. The new features are called <code>pca0</code>, <code>pca1</code>, etc... PCA can be applied in three ways:</p> <ul> <li>If the data is dense (i.e., not sparse), the estimator used is PCA.   Before fitting the transformer, the data is scaled to mean=0 and std=1   if it wasn't already. 
Read more in sklearn's documentation.</li> <li>If the data is sparse (often the case for term-document   matrices, see Vectorizer), the estimator used is TruncatedSVD.   Read more in sklearn's documentation.</li> <li>If <code>engine</code> is \"sklearnex\" or \"cuml\", the estimator   used is the package's PCA implementation. Sparse data is not supported for   either engine.</li> </ul> <p></p> <p> Selection from model SFM uses an estimator with <code>feature_importances_</code> or <code>coef_</code> attributes to select the best features in a dataset based on importance weights. The estimator is provided through the <code>solver</code> parameter and can be already fitted. ATOM allows you to use one of its predefined models, e.g., <code>solver=\"RF\"</code>. If you didn't call the FeatureSelector through atom, don't forget to indicate the estimator's task adding <code>_class</code> or <code>_reg</code> after the name, e.g., <code>RF_class</code> to use a random forest classifier. Read more in sklearn's documentation.</p> <p></p> <p> Sequential Feature Selection Sequential feature selection adds (forward selection) or removes (backward selection) features to form a feature subset in a greedy fashion. At each stage, this estimator chooses the best feature to add or remove based on the cross-validation score of an estimator. Read more in sklearn's documentation.</p> <p></p> <p> Recursive Feature Elimination Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features, and the importance of each feature is obtained either through a <code>coef_</code> or through a <code>feature_importances_</code> attribute. Then, the least important features are pruned from the current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. 
Note that, since RFE needs to fit the model again every iteration, this method can be fairly slow.</p> <p>RFECV applies the same algorithm as RFE but uses a cross-validated metric (under the scoring parameter, see RFECV) to assess every step's performance. Also, where RFE returns the number of features selected by <code>n_features</code>, RFECV returns the number of features that achieved the optimal score on the specified metric. Note that this is not always equal to the amount specified by <code>n_features</code>. Read more in sklearn's documentation.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#advanced-strategies", "title": "Advanced strategies", "text": "<p>The following strategies are a collection of nature-inspired optimization algorithms that maximize an objective function. If not manually specified, the function calculates the cross-validated score of a model on the data. Use the <code>scoring</code> parameter (not present in description, part of kwargs) to specify the metric to optimize on.</p> <p></p> <p> Particle Swarm Optimization Particle Swarm Optimization (PSO) optimizes a problem by having a population of candidate solutions (particles), and moving them around in the search-space according to simple mathematical formula over the particle's position and velocity. Each particle's movement is influenced by its local best known position, but is also guided toward the best known positions in the search-space, which are updated as better positions are found by other particles. This is expected to move the swarm toward the best solutions. Read more here.</p> <p></p> <p> Harris Hawks Optimization Harris Hawks Optimization (HHO) mimics the action and reaction of Hawk's team collaboration hunting in nature and prey escaping to discover the solutions of the single-objective problem. 
Read more here.</p> <p></p> <p> Grey Wolf Optimization The Grey Wolf Optimizer (GWO) mimics the leadership hierarchy and hunting mechanism of grey wolves in nature. Four types of grey wolves such as alpha, beta, delta, and omega are employed for simulating the leadership hierarchy. In addition, three main steps of hunting, searching for prey, encircling prey, and attacking prey, are implemented to perform optimization. Read more here.</p> <p></p> <p> Dragonfly Optimization The Dragonfly Algorithm (DFO) originates from static and dynamic swarming behaviours. These two swarming behaviours are very similar to the two main phases of optimization using meta-heuristics: exploration and exploitation. Dragonflies create sub swarms and fly over different areas in a static swarm, which is the main objective of the exploration phase. In the dynamic swarm, however, dragonflies fly in bigger swarms and along one direction, which is favourable in the exploitation phase. Read more here.</p> <p></p> <p> Genetic Optimization Genetic Optimization is a metaheuristic inspired by the process of natural selection that belongs to the larger class of evolutionary algorithms. Genetic algorithms are commonly used to generate high-quality solutions to optimization and search problems by relying on biologically inspired operators such as mutation, crossover and selection. Read more here.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#other-selection-methods", "title": "Other selection methods", "text": "<p>Removing features with low or high variance Variance is the expectation of the squared deviation of a random variable from its mean. Features with low variance have many values repeated, which means the model can't learn much from them. 
In a similar way, features with very high variance have very few values repeated, which makes it also difficult for a model to learn from this feature.</p> <p>FeatureSelector removes a categorical feature when the maximum number of occurrences for any value is below <code>min_repeated</code> or when the same value is repeated in at least <code>max_repeated</code> fraction of the rows. The default option is to remove a feature if all values in it are either different or exactly the same.</p> <p></p> <p>Removing features with multi-collinearity Two features that are highly correlated are redundant, i.e., two will not contribute more to the model than only one of them. FeatureSelector will drop a feature that has a Pearson correlation coefficient larger than <code>max_correlation</code> with another feature. A correlation of 1 means the two columns are equal. A dataframe of the removed features and their correlation values can be accessed through the <code>collinear</code> attribute.</p>"}, {"location": "user_guide/introduction/", "title": "Introduction", "text": "<p>There is no magic formula in data science that can tell us which type of machine learning estimator in combination with which pipeline will perform best for a given raw dataset. Different models are better suited for different types of data and different types of problems. You can follow some rough guide on how to approach problems with regard to which model to try, but these are incomplete at best.</p> <p>During the exploration phase of a machine learning project, a data scientist tries to find the optimal pipeline for his specific use case. This usually involves applying standard data cleaning steps, creating or selecting useful features, trying out different models, etc. Testing multiple pipelines requires many lines of code, and writing it all in the same notebook often makes it long and cluttered. 
On the other hand, using multiple notebooks makes it harder to compare the results and to keep an overview. On top of that, refactoring the code for every test can be quite time-consuming. How many times have you conducted the same action to pre-process a raw dataset? How many times have you copy-and-pasted code from an old repository to re-use it in a new use case?</p> <p>Although best practices tell us to start with a simple model and build up to more complicated ones, many data scientists just use the model best known to them in order to avoid the aforementioned problems. This can result in poor performance (because the model is just not the right one for the task) or in inefficient management of time and computing resources (because a simpler/faster model could have achieved a similar performance).</p> <p>ATOM is here to help solve these common issues. The package acts as a wrapper of the whole machine learning pipeline, helping the data scientist to rapidly find a good model for his problem. Avoid endless imports and documentation lookups. Avoid rewriting the same code over and over again. With just a few lines of code, it's now possible to perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset, providing quick insights on which pipeline performs best for the task at hand.</p> <p>It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. 
After helping you determine the right pipeline, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance.</p> <p>Example steps taken by ATOM's pipeline:</p> <ol> <li>Data Cleaning<ul> <li>Handle missing values</li> <li>Encode categorical features</li> <li>Detect and remove outliers</li> <li>Balance the training set</li> </ul> </li> <li>Feature engineering<ul> <li>Create new non-linear features</li> <li>Select the most promising features</li> </ul> </li> <li>Train and validate multiple models<ul> <li>Apply hyperparameter tuning</li> <li>Fit the models on the training set</li> <li>Evaluate the results on the test set</li> </ul> </li> <li>Analyze the results<ul> <li>Get the scores on various metrics</li> <li>Make plots to compare the model performances</li> </ul> </li> </ol> <p></p> <p></p> Figure 1. Diagram of a possible pipeline created by ATOM."}, {"location": "user_guide/logging/", "title": "Logging &amp; Tracking", "text": ""}, {"location": "user_guide/logging/#logging", "title": "Logging", "text": "<p>To start logging your experiments, fill the <code>logger</code> parameter with the name or path to store the logging file. If automatic naming is used, the file is saved using the __name__ of the class followed by the timestamp of the logger's creation, e.g. <code>ATOMClassifier_11May21_20h11m03s</code>. The logging file contains method calls, all printed messages to stdout with maximum verbosity, and any exception raised during running. Additionally, the logging entries of external libraries are redirected to the same file handler.</p> <p></p>"}, {"location": "user_guide/logging/#tracking", "title": "Tracking", "text": "<p>ATOM uses MLflow Tracking as a backend API and UI for logging models, parameters, pipelines, data and plots. Start tracking your experiments assigning a name to the <code>experiment</code> parameter. Every model is tracked using a separate run. 
When no backend is configured, the data is stored locally at <code>./mlruns</code>. To configure the backend, use mlflow.set_tracking_uri in your notebook or IDE before initializing atom. This does not affect the currently active run (if one exists), but takes effect for successive runs. Run <code>mlflow ui</code> on your terminal to open MLflow's Tracking UI and view it at http://localhost:5000.</p> <p>Note</p> <p>When using ATOM on Databricks, the experiment's name should include the complete path to the storage, e.g., <code>/Users/username@domain.com/experiment_name</code>.</p> <p>Example</p> <pre><code>from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, experiment=\"breast_cancer\")\natom.run(models=[\"LR\", \"RF\", \"LGB\"], n_trials=(0, 0, 10))\n</code></pre> <p></p> <p></p>"}, {"location": "user_guide/logging/#dagshub-integration", "title": "DAGsHub integration", "text": "<p>ATOM has a built-in integration with DAGsHub, a web platform based on open source tools, optimized for data science and oriented towards the open source community. To store your mlflow experiments in a DAGsHub repo, type <code>dagshub:&lt;experiment_name&gt;</code> in the <code>experiment</code> parameter (instead of just the experiment's name). If the repo does not already exist, a new public repo is created.</p> <p>Info</p> <p>If you are logged into your DAGsHub account when initializing atom with a dagshub experiment, a page on your web browser is automatically opened to give access permissions. 
If not, read here how to set up your DAGsHub credentials.</p> <p>Example</p> <pre><code>from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, experiment=\"dagshub:breast_cancer\")\natom.run(models=[\"LR\", \"RF\"])\n</code></pre> <p></p> <p></p>"}, {"location": "user_guide/logging/#tracked-elements", "title": "Tracked elements", "text": "<p>Tags The runs are automatically tagged with the model's full name, the branch from which the model was trained, and the time it took to fit the model. Add additional custom tags through the <code>ht_params</code> parameter, e.g., <code>atom.run([\"LR\", \"RF\"], ht_params={\"tags\": {\"tag1\": 1}})</code>.</p> <p>Parameters All parameters used by the estimator at initialization are tracked. Additional parameters passed to the fit method are not tracked.</p> <p>Model The model's estimator is stored as an artifact. The estimator has to be compatible with the mlflow.sklearn module.</p> <p>Hyperparameter tuning If hyperparameter tuning is performed, every trial is tracked as a nested run in the model's main run. This option can be switched off using atom's <code>log_ht</code> attribute, e.g., <code>atom.log_ht = False</code>. The data and pipeline options are never stored within nested runs.</p> <p>Metrics All metric results are tracked, not only during training, but also when the evaluate method is called at a later point. Metrics calculated during in-training validation are also stored.</p> <p>Dataset The train and test sets used to fit and evaluate the model can be stored as <code>.csv</code> files to the run's artifacts. This option can be switched on using atom's <code>log_data</code> attribute, e.g., <code>atom.log_data = True</code>.</p> <p>Pipeline The model's pipeline (returned from the export_pipeline method) can be stored as an artifact. 
This option can be switched on using atom's <code>log_pipeline</code> attribute, e.g., <code>atom.log_pipeline = True</code>.</p> <p>Plots By default, plots are stored as <code>.html</code> artifacts in all runs corresponding to the models that are shown in the plot. If the <code>filename</code> parameter is specified, they are stored under that name, else the method's name is used. This option can be switched off using atom's <code>log_plots</code> attribute, e.g., <code>atom.log_plots = False</code>.</p>"}, {"location": "user_guide/models/", "title": "Models", "text": ""}, {"location": "user_guide/models/#predefined-models", "title": "Predefined models", "text": "<p>ATOM provides many models for classification and regression tasks that can be used to fit the data in the pipeline. After fitting, a class containing the underlying estimator is attached to atom as an attribute. We refer to these \"subclasses\" as models. Apart from the estimator, the models contain a variety of attributes and methods that can help you understand how the underlying estimator performed. They can be accessed using their acronyms, e.g., <code>atom.LGB</code> to access the LightGBM model. 
The available models and their corresponding acronyms are:</p> <ul> <li>AdaBoost (AdaB)</li> <li>ARIMA (Arima)</li> <li>AutoARIMA (AutoARIMA)</li> <li>AutomaticRelevanceDetermination (ARD)</li> <li>Bagging (Bag)</li> <li>BayesianRidge (BR)</li> <li>BernoulliNB (BNB)</li> <li>CatBoost (CatB)</li> <li>CategoricalNB (CatNB)</li> <li>ComplementNB (CNB)</li> <li>DecisionTree (Tree)</li> <li>Dummy (Dummy)</li> <li>ElasticNet (EN)</li> <li>ETS (ETS)</li> <li>ExponentialSmoothing (ES)</li> <li>ExtraTree (ETree)</li> <li>ExtraTrees (ET)</li> <li>GaussianNB (GNB)</li> <li>GaussianProcess (GP)</li> <li>GradientBoostingMachine (GBM)</li> <li>HuberRegression (Huber)</li> <li>HistGradientBoosting (hGBM)</li> <li>KNearestNeighbors (KNN)</li> <li>Lasso (Lasso)</li> <li>LeastAngleRegression (Lars)</li> <li>LightGBM (LGB)</li> <li>LinearDiscriminantAnalysis (LDA)</li> <li>LinearSVM (lSVM)</li> <li>LogisticRegression (LR)</li> <li>MultiLayerPerceptron (MLP)</li> <li>MultinomialNB (MNB)</li> <li>NaiveForecaster (NF)</li> <li>OrdinaryLeastSquares (OLS)</li> <li>OrthogonalMatchingPursuit (OMP)</li> <li>PassiveAggressive (PA)</li> <li>Perceptron (Perc)</li> <li>PolynomialTrend (PT)</li> <li>QuadraticDiscriminantAnalysis (QDA)</li> <li>RadiusNearestNeighbors (RNN)</li> <li>RandomForest (RF)</li> <li>Ridge (Ridge)</li> <li>StochasticGradientDescent (SGD)</li> <li>SupportVectorMachine (SVM)</li> <li>XGBoost (XGB)</li> </ul> <p>Warning</p> <p>The model classes cannot be initialized directly by the user! Use them only through atom.</p> <p>Tip</p> <p>The acronyms are case-insensitive, e.g., <code>atom.lgb</code> also calls the LightGBM model.</p> <p></p>"}, {"location": "user_guide/models/#custom-models", "title": "Custom models", "text": "<p>It is also possible to create your own models in ATOM's pipeline. For example, imagine we want to use sklearn's RANSACRegressor estimator (note that it is not included in ATOM's predefined models). 
There are two ways to achieve this:</p> <ul> <li>Using ATOMModel (recommended). With this approach you can pass   the required model characteristics to the pipeline.</li> </ul> <pre><code>&gt;&gt;&gt; from atom import ATOMRegressor, ATOMModel\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n&gt;&gt;&gt; from sklearn.linear_model import RANSACRegressor\n\n&gt;&gt;&gt; ransac = ATOMModel(RANSACRegressor, name=\"RANSAC\", needs_scaling=True)\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y)\n&gt;&gt;&gt; atom.run(ransac)\n</code></pre> <ul> <li>Using the estimator's class or an instance of the class. This approach   will also call ATOMModel under the hood, but it will leave its   parameters to their default values.</li> </ul> <pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n&gt;&gt;&gt; from sklearn.linear_model import RANSACRegressor\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y)\n&gt;&gt;&gt; atom.run(RANSACRegressor)\n</code></pre> <p>Additional things to take into account:</p> <ul> <li>Custom models can be accessed through their acronym like any other model, e.g.   
<code>atom.ransac</code> in the example above.</li> <li>Custom models are not restricted to sklearn estimators, but they should   follow sklearn's API, i.e., have a fit and predict method.</li> <li>Parameter customization (for the initializer) is only possible for   custom models which provide an estimator that has a <code>set_params()</code> method,   i.e., it's a child class of BaseEstimator.</li> <li>Hyperparameter tuning for custom models is ignored unless appropriate   dimensions are provided through <code>ht_params</code>.</li> </ul> <p></p>"}, {"location": "user_guide/models/#deep-learning", "title": "Deep learning", "text": "<p>Deep learning models can be used through ATOM's custom models as long as they follow sklearn's API. For example, models implemented with the Keras package should use the scikeras wrappers KerasClassifier or KerasRegressor.</p> <p>Many deep learning use cases, for example in computer vision, use datasets with more than 2 dimensions, e.g., image data can have shape (n_samples, length, width, rgb). Luckily, scikeras has a workaround to be able to work with such datasets. Learn with this example how to use ATOM to train and validate a Convolutional Neural Network on an image dataset.</p> <p>Warning</p> <p>Models implemented with keras can only use custom hyperparameter tuning when <code>n_jobs=1</code> or <code>ht_params={\"cv\": 1}</code>. Using n_jobs &gt; 1 and cv &gt; 1 raises a PicklingError due to incompatibilities of the APIs.</p> <p></p>"}, {"location": "user_guide/models/#ensembles", "title": "Ensembles", "text": "<p>Ensemble models use multiple estimators to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone. ATOM implements two ensemble techniques: voting and stacking. 
Click here to see an example that uses ensemble models.</p> <p>If the ensemble's underlying estimator is a model that used automated feature scaling, it's added as a Pipeline containing the <code>scaler</code> and estimator. If an mlflow experiment is active, the ensembles start their own run, just like the predefined models do.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p>"}, {"location": "user_guide/models/#voting", "title": "Voting", "text": "<p>The idea behind voting is to combine the predictions of conceptually different models to make new predictions. Such a technique can be useful for a set of equally well performing models in order to balance out their individual weaknesses. Read more in sklearn's documentation.</p> <p>A voting model is created from a trainer through the voting method. The voting model is added automatically to the list of models in the trainer, under the <code>Vote</code> acronym. The underlying estimator is a custom adaptation of VotingClassifier or VotingRegressor depending on the task. The differences between ATOM's and sklearn's implementation are:</p> <ul> <li>ATOM's implementation doesn't fit estimators if they're already fitted.</li> <li>ATOM's instance is considered fitted at initialization when all underlying   estimators are.</li> <li>ATOM's VotingClassifier doesn't implement a LabelEncoder to encode the   target column.</li> </ul> <p>The two estimators are customized in this way to save time and computational resources, since the classes are always initialized with fitted estimators. As a consequence of this, the VotingClassifier cannot use sklearn's built-in LabelEncoder for the target column since it can't be fitted when initializing the class. For the vast majority of use cases, the changes will have no effect. 
If you want to export the estimator and retrain it on different data, just make sure to clone the underlying estimators first.</p> <p></p>"}, {"location": "user_guide/models/#stacking", "title": "Stacking", "text": "<p>Stacking is a method for combining estimators to reduce their biases. More precisely, the predictions of each individual estimator are stacked together and used as input to a final estimator to compute the prediction. Read more in sklearn's documentation.</p> <p>A stacking model is created from a trainer through the stacking method. The stacking model is added automatically to the list of models in the trainer, under the <code>Stack</code> acronym. The underlying estimator is a custom adaptation of StackingClassifier or StackingRegressor depending on the task. The only difference between ATOM's and sklearn's implementation is that ATOM's implementation doesn't fit estimators if they're already fitted. The two estimators are customized in this way to save time and computational resources, since the classes are always initialized with fitted estimators. For the vast majority of use cases, the changes will have no effect. If you want to export the estimator and retrain it on different data, just make sure to clone the underlying estimators first.</p>"}, {"location": "user_guide/nlp/", "title": "Natural Language Processing", "text": "<p>Natural Language Processing (NLP) is the subfield of machine learning that works with human language data. The nlp module contains four classes that help to convert raw text to meaningful numeric values, ready to be ingested by a model. ATOM uses the nltk library for the majority of its NLP processes.</p> <p>The text documents are expected to be provided in a column of the dataframe named <code>corpus</code> (the name is case-insensitive). Only the corpus is changed by the transformers, leaving the rest of the columns as is. 
This mechanism allows atom to combine datasets containing a text corpus with other non-text features. If an array is provided as input, it should consist of only one feature containing the text (one document per row). ATOM will then automatically convert the array to a dataframe with the desired column name. Documents are expected to be strings or sequences of words. Click here for an example using text data.</p> <p>Note</p> <p>All of atom's NLP methods automatically adopt the relevant transformer attributes (<code>verbose</code>, <code>logger</code>) from atom. A different choice can be added as parameter to the method call, e.g., <code>atom.tokenize(verbose=0)</code>.</p> <p>Info</p> <p>ATOM doesn't do topic modeling! The module's goal is to help process text documents into features that can be used for supervised learning.</p> <p></p>"}, {"location": "user_guide/nlp/#text-cleaning", "title": "Text cleaning", "text": "<p>Text data is rarely clean. Whether it's scraped from a website or inferred from paper documents, it's always populated with irrelevant information for the model, such as email addresses, HTML tags, numbers or punctuation marks. Use the TextCleaner class to clean the corpus from such noise. It can be accessed from atom through the textclean method. Use the class' parameters to choose which transformations to perform. 
The available steps are:</p> <ul> <li>Decode unicode characters to their ascii representations.</li> <li>Convert all characters to lower case.</li> <li>Drop email addresses from the text.</li> <li>Drop URL links from the text.</li> <li>Drop HTML tags from the text.</li> <li>Drop emojis from the text.</li> <li>Drop numbers from the text.</li> <li>Drop punctuations from the text.</li> </ul> <p></p>"}, {"location": "user_guide/nlp/#tokenization", "title": "Tokenization", "text": "<p>Some text processing algorithms, like stemming or lemmatization, require the corpus to be made out of tokens, instead of strings, in order to know what to consider as words. Tokenization is used to achieve this. It separates every document into a sequence of smaller units. In this case, the words.</p> <p>Sometimes, words have a different meaning on their own than when combined with adjacent words. For example, the word <code>new</code> has a completely different meaning when the word <code>york</code> is directly after it than when it's not. These combinations of two words are called bigrams. When there are three words, they are called trigrams, and with four words quadgrams.</p> <p>The Tokenizer class converts a document into a sequence of words, and can create the most frequent bigrams, trigrams and quadgrams. It can be accessed from atom through the tokenize method.</p> <p></p>"}, {"location": "user_guide/nlp/#text-normalization", "title": "Text Normalization", "text": "<p>Normalization for texts is a process that converts a list of words to a more uniform standard. This is useful to reduce the amount of different information that the computer has to deal with, and therefore improves efficiency. The goal of normalization techniques like stemming and lemmatization is to reduce inflectional and related forms of a word to a common base form.</p> <p>Normalize the words in the corpus using the TextNormalizer class. 
It can be accessed from atom through the textnormalize method.</p> <p></p>"}, {"location": "user_guide/nlp/#vectorization", "title": "Vectorization", "text": "<p>Text data cannot be fed directly to the algorithms themselves, as most of them expect numerical feature vectors with a fixed size, rather than words in the text documents with variable length. Vectorization is the general process of turning a collection of text documents into numerical feature vectors. You can apply it to the corpus using the Vectorizer class. It can be accessed from atom through the vectorize method.</p> <p>Info</p> <p>All strategies can utilize GPU speed-up. Click here for further information about GPU acceleration.</p> <p></p> <p>Bag of Words The Bag of Words (BOW) strategy applies tokenization, counting and normalization to the corpus. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document. The created columns are named with the words they are embedding with the prefix <code>corpus_</code>. Read more in sklearn's documentation.</p> <p></p> <p>TF-IDF In a large text corpus, some words will be very present (e.g., \u201cthe\u201d, \u201ca\u201d, \u201cis\u201d in English), hence carrying very little meaningful information about the actual contents of the document. If we were to feed the direct count data directly to a classifier, those very frequent terms would shadow the frequencies of rarer, yet more interesting, terms. Use the TF-IDF strategy to re-weight the count features into floating point values. The created columns are named with the words they are embedding with the prefix <code>corpus_</code>. Read more in sklearn's documentation.</p> <p></p> <p>Hashing The larger the corpus, the larger the vocabulary will grow and thus increasing the number of features and memory use. Use the Hashing strategy to hash the words to a specified number of features. 
The created features are named <code>hash0</code>, <code>hash1</code>, etc... Read more in sklearn's documentation.</p>"}, {"location": "user_guide/nomenclature/", "title": "Nomenclature", "text": "<p>This documentation consistently uses terms to refer to certain concepts related to this package. The most frequent terms are described hereunder.</p> <p></p> ATOM <p>Refers to this package.</p> atom <p>Instance of the ATOMClassifier, ATOMForecaster or ATOMRegressor classes (note that the examples use it as the default variable name).</p> <p>A pipeline, corresponding dataset and models fitted to that dataset. See the branches section of the user guide.</p> categorical columns <p>Refers to all columns of type <code>object</code> or <code>category</code>.</p> class <p>Unique value in a column, e.g., a binary classifier has 2 classes in the target column.</p> dataframe <p>Two-dimensional, size-mutable, potentially heterogeneous tabular data of type pd.DataFrame or its modin counterpart.</p> dataframe-like <p>Any type object from which a dataframe can be created. This includes an iterable, a dict whose values are 1d-arrays, a two-dimensional list, tuple, np.ndarray or sps.csr_matrix, and most commonly, a dataframe. This is the standard input format for any dataset.</p> <p>Additionally, you can provide a callable whose output is any of the aforementioned types. This is useful when the dataset is very large and you are performing parallel operations, since it can avoid broadcasting a large dataset from the driver to the workers.</p> estimator <p>An object which manages the estimation and decoding of an algorithm. The algorithm is estimated as a deterministic function of a set of parameters, a dataset and a random state. Should implement a <code>fit</code> method. 
Often used interchangeably with predictor because of user preference.</p> index <p>Immutable sequence used for indexing and alignment of type pd.Index or its modin counterpart.</p> missing values <p>All values in the <code>missing</code> attribute, as well as <code>None</code>, <code>NaN</code>, <code>+inf</code> and <code>-inf</code>.</p> model <p>Instance of a model in atom. Not to be confused with estimator.</p> outliers <p>Sample that contains one or more outlier values. Note that the Pruner class can use a different definition for outliers depending on the chosen strategy.</p> outlier value <p>Value that lies further than 3 times the standard deviation away from the mean of its column, i.e., |z-score| &gt; 3.</p> predictor <p>An estimator implementing a <code>predict</code> method.</p> scorer <p>A non-estimator callable object which evaluates an estimator on given test data, returning a number. Unlike evaluation metrics, a greater returned number must correspond with a better score. See sklearn's documentation.</p> segment <p>Subset (segment) of a sequence, whether through slicing or generating a range of values. When given as a parameter type, it includes both range and slice.</p> sequence <p>A one-dimensional, indexable array of type sequence (except string), np.ndarray, index or series. This is the standard input format for a dataset's target column.</p> series <p>One-dimensional ndarray with axis labels of type pd.Series or its modin counterpart.</p> target <p>The dependent variable in a supervised learning task. 
Passed as <code>y</code> to an estimator's fit method.</p> task <p>One of the supervised machine learning approaches that ATOM supports:</p> <ul> <li>binary classification</li> <li>multiclass classification</li> <li>multilabel classification</li> <li>multiclass-multioutput classification</li> <li>regression</li> <li>multioutput regression</li> <li>univariate forecast</li> <li>multivariate forecast</li> </ul> transformer <p>An estimator implementing a <code>transform</code> method. This encompasses all data cleaning and feature engineering classes.</p>"}, {"location": "user_guide/plots/", "title": "Plots", "text": "<p>ATOM provides many plotting methods to analyze the data or compare the model performances. Descriptions and examples can be found in the API section. ATOM mainly uses the plotly library for plotting. Plotly makes interactive, publication-quality graphs that are rendered using html. Some plots require other libraries like matplotlib, shap, wordcloud and schemdraw.</p> <p>Plots that compare model performances (methods with the <code>models</code> parameter) can be called directly from atom, e.g., <code>atom.plot_roc()</code>, or from one of the models, e.g., <code>atom.adab.plot_roc()</code>. If called from atom, use the <code>models</code> parameter to specify which models to plot. If called from a specific model, it makes the plot only for that model and the <code>models</code> parameter becomes unavailable.</p> <p>Plots that analyze the data (methods without the <code>models</code> parameter) can only be called from atom, and not from the models.</p> <p></p>"}, {"location": "user_guide/plots/#parameters", "title": "Parameters", "text": "<p>Apart from the plot-specific parameters, all plots have five parameters in common:</p> <ul> <li>The <code>title</code> parameter adds a title to the plot. The default value doesn't   show any title. 
Provide a configuration (as dictionary) to customize its   appearance, e.g., <code>title=dict(text=\"Awesome plot\", color=\"red\")</code>.   Read more in plotly's documentation.</li> <li> <p>The <code>legend</code> parameter is used to show/hide, position or customize the   plot's legend. Provide a configuration (as dictionary) to customize its   appearance (e.g., <code>legend=dict(title=\"Title for legend\", title_font_color=\"red\")</code>)   or choose one of the following locations:</p> <ul> <li>upper left</li> <li>upper right</li> <li>lower left</li> <li>lower right</li> <li>upper center</li> <li>lower center</li> <li>center left</li> <li>center right</li> <li>center</li> <li>out: Position the legend outside the axis, on the right hand side. This   is plotly's default position. Note that this shrinks the size of the axis   to fit both legend and axes in the specified <code>figsize</code>.</li> </ul> </li> <li> <p>The <code>figsize</code> parameter adjusts the plot's size.</p> </li> <li>The <code>filename</code> parameter is used to save the plot.</li> <li>The <code>display</code> parameter determines whether to show or return the plot.</li> </ul> <p>Info</p> <p>In some plotting methods, it's possible to plot separate lines for different subsets of the rows. For example, to compare the results on the train and test set. For these cases, either provide a sequence to the <code>rows</code> parameter for every line you want to draw, e.g., <code>atom.plot_roc(rows=(\"train\", \"test\"))</code>, or provide a dictionary where the keys are the names of the sets (used in the legend) and the values are the corresponding selection of rows, selected using any of the aforementioned approaches, e.g., <code>atom.plot_roc(rows={\"0-99\": range(100), \"100-199\": range(100, 200)})</code>. Note that for these methods, using <code>atom.plot_roc(rows=\"train+test\")</code> only plots one line with the data from both sets. 
See the advanced plotting example.</p> <p></p>"}, {"location": "user_guide/plots/#aesthetics", "title": "Aesthetics", "text": "<p>The plot's aesthetics can be customized using the plot attributes prior to calling the plotting method, e.g., <code>atom.title_fontsize = 30</code>. The default values are:</p> <ul> <li>palette: [\"rgb(0, 98, 98)\", \"rgb(56, 166, 165)\", \"rgb(115, 175, 72)\",   \"rgb(237, 173, 8)\", \"rgb(225, 124, 5)\", \"rgb(204, 80, 62)\", \"rgb(148, 52, 110)\",   \"rgb(111, 64, 112)\", \"rgb(102, 102, 102)\"]</li> <li>title_fontsize: 24</li> <li>label_fontsize: 16</li> <li>tick_fontsize: 12</li> </ul> <p>Use atom's update_layout method to further customize the plot's layout using any of plotly's layout properties, e.g., <code>atom.update_layout(template=\"plotly_dark\")</code>. Similarly, use the update_traces method to customize the traces properties, e.g. <code>atom.update_traces(mode=\"lines+markers\")</code>.</p> <p>The reset_aesthetics method allows you to reset all aesthetics to their default value. See advanced plotting for an example.</p> <p></p>"}, {"location": "user_guide/plots/#canvas", "title": "Canvas", "text": "<p>Use the canvas method to draw multiple plots side by side, for example to make it easier to compare similar results. The canvas method is a <code>@contextmanager</code>, i.e., it's used through Python's <code>with</code> command. Plots in a canvas ignore the legend, figsize, filename and display parameters. Instead, specify these parameters in the canvas. If a variable is assigned to the canvas (e.g., <code>with atom.canvas() as fig</code>), it yields the resulting figure.</p> <p>For example, we can use a canvas to compare the results of a XGBoost and LightGBM model on the train and test set. We could also draw the lines for both models in the same axes, but that would clutter the plot too much. 
Click here for more examples.</p> <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.run([\"XGB\", \"LGB\"])\n\n&gt;&gt;&gt; with atom.canvas(2, 2, title=\"XGBoost vs LightGBM\"):\n...     atom.xgb.plot_roc(rows=\"train+test\", title=\"ROC - XGBoost\")\n...     atom.lgb.plot_roc(rows=\"train+test\", title=\"ROC - LightGBM\")\n...     atom.xgb.plot_prc(rows=\"train+test\", title=\"PRC - XGBoost\")\n...     atom.lgb.plot_prc(rows=\"train+test\", title=\"PRC - LightGBM\")\n</code></pre> <p></p>"}, {"location": "user_guide/plots/#shap", "title": "SHAP", "text": "<p>The SHAP (SHapley Additive exPlanations) Python package uses a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. ATOM implements methods to plot 7 of SHAP's plotting functions directly from its API. A list of available shap plots can be found here.</p> <p>Calculating the Shapley values is computationally expensive, especially for model agnostic explainers like Permutation. To avoid having to recalculate the values for every plot, ATOM stores the Shapley values internally after the first calculation, and accesses them later when needed again.</p> <p>Note</p> <p>Since the plot figures are not made by ATOM, note the following:</p> <ul> <li>It's not possible to draw multiple models in the same figure.   Selecting more than one model will raise an exception. To avoid   
To avoid   this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_force()</code>.</li> <li>The returned plot is a matplotlib figure, not plotly's.</li> </ul> <p></p>"}, {"location": "user_guide/plots/#available-plots", "title": "Available plots", "text": "<p>A list of available plots can be found hereunder. Note that not all plots can be called from every class and that their availability can depend on the task at hand.</p>"}, {"location": "user_guide/plots/#data-plots", "title": "Data plots", "text": "<p>plot_componentsPlot the explained variance ratio per component.plot_correlationPlot a correlation matrix.plot_distributionPlot column distributions.plot_ngramsPlot n-gram frequencies.plot_pcaPlot the explained variance ratio vs number of components.plot_qqPlot a quantile-quantile plot.plot_relationshipsPlot pairwise relationships in a dataset.plot_rfecvPlot the rfecv results.plot_wordcloudPlot a wordcloud from the corpus.</p>"}, {"location": "user_guide/plots/#hyperparameter-tuning-plots", "title": "Hyperparameter tuning plots", "text": "<p>plot_edfPlot the Empirical Distribution Function of a study.plot_hyperparameter_importancePlot a model's hyperparameter importance.plot_hyperparametersPlot hyperparameter relationships in a study.plot_parallel_coordinatePlot high-dimensional parameter relationships in a study.plot_pareto_frontPlot the Pareto front of a study.plot_slicePlot the parameter relationship in a study.plot_terminator_improvementPlot the potentials for future objective improvement.plot_timelinePlot the timeline of a study.plot_trialsPlot the hyperparameter tuning trials.</p>"}, {"location": "user_guide/plots/#prediction-plots", "title": "Prediction plots", "text": "<p>plot_calibrationPlot the calibration curve for a binary classifier.plot_confusion_matrixPlot a model's confusion matrix.plot_detPlot the Detection Error Tradeoff curve.plot_errorsPlot a model's prediction errors.plot_evalsPlot evaluation curves.plot_feature_importancePlot 
a model's feature importance.plot_forecastPlot a time series with model forecasts.plot_gainsPlot the cumulative gains curve.plot_learning_curvePlot the learning curve: score vs number of training samples.plot_liftPlot the lift curve.plot_parshapPlot the partial correlation of shap values.plot_partial_dependencePlot the partial dependence of features.plot_permutation_importancePlot the feature permutation importance of models.plot_pipelinePlot a diagram of the pipeline.plot_prcPlot the precision-recall curve.plot_probabilitiesPlot the probability distribution of the target classes.plot_residualsPlot a model's residuals.plot_resultsPlot the model results.plot_rocPlot the Receiver Operating Characteristics curve.plot_successive_halvingPlot scores per iteration of the successive halving.plot_thresholdPlot metric performances against threshold values.</p>"}, {"location": "user_guide/plots/#shap-plots", "title": "Shap plots", "text": "<p>plot_shap_barPlot SHAP's bar plot.plot_shap_beeswarmPlot SHAP's beeswarm plot.plot_shap_decisionPlot SHAP's decision plot.plot_shap_forcePlot SHAP's force plot.plot_shap_heatmapPlot SHAP's heatmap plot.plot_shap_scatterPlot SHAP's scatter plot.plot_shap_waterfallPlot SHAP's waterfall plot.</p>"}, {"location": "user_guide/predicting/", "title": "Predicting", "text": "<p>After training a model, you probably want to make predictions on new, unseen data. Just like a sklearn estimator, you can call the prediction methods from the model, e.g., <code>atom.tree.predict(X)</code>.</p> <p>All prediction methods transform the provided data through the pipeline in the model's branch before making the predictions. 
Transformers that should only be applied on the training set are excluded from this step (e.g., outlier pruning or class balancing).</p> <p>The available prediction methods are the standard methods for estimators in sklearn's and sktime's API.</p> <p>For classification and regression tasks:</p> <p>decision_functionGet confidence scores on new data or existing rows.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.scoreGet a metric score on new data.</p> <p>For forecast tasks:</p> <p>predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.scoreGet a metric score on new data.</p> <p>Warning</p> <p>The <code>score</code> method return atom's metric score, not the metric returned by sklearn/sktime's score method for estimators. Use the method's <code>metric</code> parameter to calculate a different metric.</p> <p>Note</p> <ul> <li>The output of ATOM's methods are pandas objects, not numpy arrays.</li> <li>The <code>predict_proba</code> method of some meta-estimators for multioutput tasks   (such as MultioutputClassifier) return 3 dimensions, namely, a list of   arrays with shape=(n_samples, n_classes). One array per target column. Since   ATOM's prediction methods return pandas objects, such 3-dimensional arrays   are converted to a multiindex pd.DataFrame, where the first level of the row   indices are the target columns, and the second level are the classes.</li> <li>The prediction results are cached after the first call to avoid consequent   expensive calculations. This mechanism can increase the size of the instance   for large datasets. 
Use the clear method to free the   memory.</li> </ul> <p>It's also possible to get the prediction for a specific row or rows in the dataset. See the row and column selection section in the user guide to learn how to select the rows, e.g., <code>atom.rf.predict(\"test\")</code> or <code>atom.rf.predict_proba(range(100))</code>.</p> <p>Note</p> <p>For forecast models, prediction on rows follow the ForecastingHorizon API. That means that using the row index works, but for example using <code>atom.arima.predict(1)</code> returns the prediction on the first row of the test set (instead of the second row of the train set).</p>"}, {"location": "user_guide/time_series/", "title": "Time series", "text": ""}, {"location": "user_guide/time_series/#forecast", "title": "Forecast", "text": ""}, {"location": "user_guide/time_series/#time-series-classification", "title": "Time series classification", "text": ""}, {"location": "user_guide/time_series/#time-series-regression", "title": "Time series regression", "text": ""}, {"location": "user_guide/training/", "title": "Training", "text": "<p>The training phase is where the models are fitted on the training data. After this, you can use the plots and prediction methods to evaluate the results. The training applies the following steps for all models:</p> <ol> <li>Use hyperparameter tuning to select the optimal hyperparameters for     the model (optional).</li> <li>The model is fitted on the training set using the best combination    of hyperparameters found. 
After that, the model is evaluated on the tes set.</li> <li>Calculate various scores on the test set using a bootstrap    algorithm (optional).</li> </ol> <p>There are three approaches to run the training.</p> <ul> <li>Direct training:<ul> <li>DirectClassifier</li> <li>DirectForecaster</li> <li>DirectRegressor</li> </ul> </li> <li>Training via successive halving:<ul> <li>SuccessiveHalvingClassifier</li> <li>SuccessiveHalvingForecaster</li> <li>SuccessiveHalvingRegressor</li> </ul> </li> <li>Training via train sizing:<ul> <li>TrainSizingClassifier</li> <li>TrainSizingForecaster</li> <li>TrainSizingRegressor</li> </ul> </li> </ul> <p>The direct fashion repeats the aforementioned steps only once, while the other two approaches repeats them more than once. Just like the data cleaning and feature engineering classes, it's discouraged to use these classes directly. Instead, every approach can be called directly from atom through the run, successive_halving and train_sizing methods respectively.</p> <p>Models are called through their acronyms, e.g., <code>atom.run(models=\"RF\")</code> will train a RandomForest. If you want to run the same model multiple times, add a tag after the acronym to differentiate them. the tag must be  separated from the accronym by an underscore.</p> <pre><code>atom.run(\n    models=[\"RF_1\", \"RF_2\"],\n    est_params={\n        \"RF_1\": {\"n_estimators\": 100},\n        \"RF_2\": {\"n_estimators\": 200},\n    }\n)\n</code></pre> <p>For example, this pipeline fits two Random Forest models, one with 100 and the other with 200 decision trees. The models can be accessed through <code>atom.rf_1</code> and <code>atom.rf_2</code>. Use tagged models to test how the same model performs when fitted with different parameters or on different data sets. 
See the Imbalanced datasets example.</p> <p>Additional things to take into account:</p> <ul> <li>If an exception is encountered while fitting an estimator, the   pipeline will automatically jump to the next model. The exceptions are   stored in the <code>errors</code> attribute. Note that when a model is skipped,   there is no model subclass for that estimator.</li> <li>When showing the final results, a <code>!</code> indicates the highest score   and a <code>~</code> indicates that the model is possibly overfitting (training   set has a score at least 20% higher than the test set).</li> </ul> <p></p>"}, {"location": "user_guide/training/#metric", "title": "Metric", "text": "<p>ATOM uses sklearn's scorers for model evaluation. A scorer consists of a metric function and some parameters that define the scorer's properties , such as if a higher or lower score is better (score or loss function) or if the function needs probability estimates or rounded predictions (see the make_scorer function). The <code>metric</code> parameter accepts three ways of defining the scorer:</p> <ul> <li>Using the name of one of the predefined scorers.</li> <li>Using a function with signature <code>function(y_true, y_pred) -&gt; score</code>.   In this case, ATOM uses make_scorer   with default parameters.</li> <li>Using a scorer object.</li> </ul> <p>Note that all scorers follow the convention that higher return values are better than lower return values. Thus, metrics which measure the distance between the model and the data (i.e., loss functions), like <code>max_error</code> or <code>mean_squared_error</code>, will return the negated value of the metric.</p> <p></p>"}, {"location": "user_guide/training/#predefined-scorers", "title": "Predefined scorers", "text": "<p>ATOM accepts all sklearn's scorers as well as some custom acronyms and custom scorers. 
Since some of sklearn's scorers have quite long names and ATOM is all about lazyfast experimentation, the package provides acronyms for some of the most commonly used ones. These acronyms are case-insensitive and can be used in the <code>metric</code> parameter instead of the scorer's full name, e.g., <code>atom.run(\"LR\", metric=\"BA\")</code> uses <code>balanced_accuracy</code>. The available acronyms are:</p> <ul> <li>\"AP\" for \"average_precision\"</li> <li>\"BA\" for \"balanced_accuracy\"</li> <li>\"AUC\" for \"roc_auc\"</li> <li>\"LogLoss\" for \"neg_log_loss\"</li> <li>\"EV\" for \"explained_variance\"</li> <li>\"ME\" for \"max_error\"</li> <li>\"MAE\" for \"neg_mean_absolute_error\"</li> <li>\"MSE\" for \"neg_mean_squared_error\"</li> <li>\"RMSE\" for \"neg_root_mean_squared_error\"</li> <li>\"MSLE\" for \"neg_mean_squared_log_error\"</li> <li>\"MEDAE\" for \"neg_median_absolute_error\"</li> <li>\"MAPE\" for \"neg_mean_absolute_percentage_error\"</li> <li>\"POISSON\" for \"neg_mean_poisson_deviance\"</li> <li>\"GAMMA\" for \"neg_mean_gamma_deviance\"</li> </ul> <p>ATOM also provides some extra common metrics for binary classification tasks. </p> <ul> <li>\"TN\" for True Negatives</li> <li>\"FP\" for False Positives</li> <li>\"FN\" for False Negatives</li> <li>\"TP\" for True Positives</li> <li>\"FPR\" for False Positive rate (fall-out)</li> <li>\"TPR\" for True Positive Rate (sensitivity, recall)</li> <li>\"TNR\" for True Negative Rate (specificity)</li> <li>\"FNR\" for False Negative Rate (miss rate)</li> <li>\"MCC\" for Matthews Correlation Coefficient (also for multiclass classification)</li> </ul> <p></p>"}, {"location": "user_guide/training/#multi-metric-runs", "title": "Multi-metric runs", "text": "<p>Sometimes it is useful to measure the performance of the models in more than one way. ATOM lets you run the pipeline with multiple metrics at the same time. 
To do so, provide the <code>metric</code> parameter with a list of desired metrics, e.g., <code>atom.run(\"LDA\", metric=[\"r2\", \"mse\"])</code>.</p> <p>When fitting multi-metric runs, the resulting scores will return a list of metrics. For example, if you provided three metrics to the pipeline, <code>atom.knn.score_train</code> could return [0.8734, 0.6672, 0.9001]. Only the first metric of a multi-metric run (this metric is called the main metric) is used to select the winning model.</p> <p>Info</p> <ul> <li>The <code>winning</code> model is retrieved comparing only   the main metric.</li> <li>Some plots let you choose which of the metrics in a multi-metric run   to show using the <code>metric</code> parameter, e.g., plot_results.</li> </ul> <p></p>"}, {"location": "user_guide/training/#automated-feature-scaling", "title": "Automated feature scaling", "text": "<p>Models that require feature scaling will automatically do so before training, unless the data is sparse or already scaled. The data is considered scaled if it has one of the following prerequisites:</p> <ul> <li>The mean value over the mean of all columns lies between -0.05 and 0.05   and the mean of the standard deviation over all columns lies between 0.85   and 1.15. Categorical and binary columns (only 0s and 1s) are excluded   from the calculation.</li> <li>There is a transformer in the pipeline whose __name__ contains the   word <code>scaler</code>.</li> </ul> <p>The scaling is applied using a Scaler with default parameters. It can be accessed from the model through the <code>scaler</code> attribute. The scaled dataset can be examined through the model's data attributes. Use the available_models method to see which models require feature scaling. See here an example.</p> <p></p>"}, {"location": "user_guide/training/#in-training-validation", "title": "In-training validation", "text": "<p>Some predefined models allow in-training validation. 
This means that the estimator is evaluated (using only the main metric) on the train and test set after every round of the training (a round can be an iteration for linear models or an added tree for boosted tree models). The validation scores are stored in the <code>evals</code> attribute, a dictionary of the train and test performances per round (also when pruning isn't applied). Click here for an example using in-training validation.</p> <p>The predefined models that support in-training validation are:</p> <ul> <li>CatBoost</li> <li>LightGBM</li> <li>MultiLayerPerceptron</li> <li>PassiveAggressive</li> <li>Perceptron</li> <li>StochasticGradientDescent</li> <li>XGBoost</li> </ul> <p>To apply in-training validation to a custom model, use the <code>has_validation</code> parameter when creating the custom model.</p> <p>Warning</p> <ul> <li>In-training validation is not calculated during hyperparameter tuning.</li> <li>CatBoost selects the weights achieved by the best evaluation on the test set after training. This means that, by default, there is some minor data leakage in the test set. Use the <code>use_best_model=False</code> parameter to avoid this behavior or use a holdout set to evaluate the final estimator.</li> </ul> <p>Tip</p> <p>Use the plot_evals method to visualize the in-training validation on the train and test sets.</p> <p></p>"}, {"location": "user_guide/training/#parameter-customization", "title": "Parameter customization", "text": "<p>By default, every estimator uses the default parameters they get from their respective packages. To select different ones, use the <code>est_params</code>. parameter of the run method. There are two ways to add custom parameters to the models: adding them directly to the dictionary as key-value pairs or through dictionaries.</p> <p>Adding the parameters directly to <code>est_params</code> (or using a dict with the key 'all') shares them across all models in the trainer. 
In this example, both the XGBoost and the LightGBM model use 200 boosted trees. Make sure all the models do have the specified parameters or an exception will be raised!</p> <pre><code>atom.run(models=[\"XGB\", \"LGB\"], est_params={\"n_estimators\": 200})\n</code></pre> <p>To specify parameters per model, use the model name as key and a dict of the parameters as value. In this example, the XGBoost model uses <code>n_estimators=200</code> and the MultiLayerPerceptron uses one hidden layer with 75 neurons.</p> <pre><code>atom.run(\n    models=[\"XGB\", \"MLP\"],\n    est_params={\n        \"XGB\": {\"n_estimators\": 200},\n        \"MLP\": {\"hidden_layer_sizes\": (75,)},\n    }\n)\n</code></pre> <p>Some estimators allow you to pass extra parameters to the fit method (besides X and y). This can be done adding <code>_fit</code> at the end of the parameter. For example, to change XGBoost's verbosity, we can run:</p> <pre><code>atom.run(models=\"XGB\", est_params={\"verbose_fit\": True})\n</code></pre> <p>Note</p> <p>If a parameter is specified through <code>est_params</code>, it's ignored by the study, even if it's added manually to <code>ht_params[\"distributions\"]</code>.</p> <p>Info</p> <p>The estimator's <code>n_jobs</code> and <code>random_state</code> parameters adopt atom's values (when available), unless specified through <code>est_params</code>.</p> <p></p>"}, {"location": "user_guide/training/#hyperparameter-tuning", "title": "Hyperparameter tuning", "text": "<p>In order to achieve maximum performance, it's important to tune an estimator's hyperparameters before training it. ATOM provides hyperparameter tuning through the optuna package. 
Just like optuna, we use the terms <code>study</code> and <code>trial</code> as follows:</p> <ul> <li>Study: optimization based on an objective function.</li> <li>Trial: a single execution of the objective function.</li> </ul> <p>Each trial is either computed by cross-validation on the complete training set or by randomly splitting the training set every iteration into a (sub)training and validation set. This process can create some minimum data leakage towards specific parameters (since the estimator is evaluated on data that is used to train the next estimator), but it ensures maximal use of the provided data. However, the leakage is not present in the independent test set, thus the final score of every model is unbiased. Note that, if the dataset is relatively small, the tuning's best score can consistently be lower than the final score on the test set due to the considerable lower fraction of instances on which it is trained. After finishing the study, the parameters that resulted in the best score are used to fit the final model on the complete training set.</p> <p>Info</p> <ul> <li>Unless specified differently by the user, the used samplers   are TPESampler   for single-metric runs and NSGAIISampler   for multi-metric runs.</li> <li>For multi-metric runs, the selected best trial   is the trial that performed best on the main metric. Use the property's   <code>@setter</code> to change it to any other trial. See the hyperparameter tuning   example.</li> </ul> <p>There are many possibilities to tune the study to your liking. 
The main parameter is <code>n_trials</code>, which determine the number of trials that are performed.</p> <p>Extra things to take into account:</p> <ul> <li>The train/validation splits are different per trial but equal for all models.</li> <li>Re-evaluating the objective function at the same point (with the same   hyperparameters) automatically skips the calculation and returns the   same score as the equivalent trial.</li> </ul> <p>Tip</p> <p>The hyperparameter tuning output can become quite wide for models with many hyperparameters. If you are working in a Jupyter Notebook, you can change the output's width running the following code in a cell: <pre><code>from IPython.display import display, HTML\ndisplay(HTML(\"&lt;style&gt;.container { width:100% !important; }&lt;/style&gt;\"))\n</code></pre></p> <p>Other settings can be changed through the <code>ht_params</code> parameter, a dictionary where every key-value combination can be used to further customize the optimization.</p> <p>By default, which hyperparameters are tuned and their corresponding distributions are predefined by ATOM. Use the 'distributions' key to customize these. Just like with <code>est_params</code>, it's possible to share the same parameters across models or use a dictionary with the model name as key to specify the parameters for every individual model. Use the key 'all' to tune some hyperparameters for all models when you also want to tune other parameters only for specific ones. The following example tunes the <code>n_estimators</code> parameter for both models but the <code>max_depth</code> parameter only for the RandomForest.</p> <pre><code>atom.run(\n    models=[\"ET\", \"RF\"],\n    n_trials=30,\n    ht_params={\"distributions\": {\"all\": \"n_estimators\", \"RF\": \"max_depth\"}},\n)\n</code></pre> <p>Like the <code>columns</code> parameter in atom's methods, you can exclude parameters from the optimization adding <code>!</code> before its name. 
It's possible to exclude multiple parameters, but not to combine inclusion and exclusion for the same model. For example, to optimize a RandomForest using all its predefined parameters except <code>n_estimators</code>, run:</p> <pre><code>atom.run(\n    models=\"ET\",\n    n_trials=15,\n    ht_params={\"distributions\": \"!n_estimators\"},\n)\n</code></pre> <p>If just the parameter name is provided, the predefined distribution is used. It's also possible to provide custom distributions spaces, but make sure they are compliant with optuna's API. See every model's individual documentation in ATOM's API section for an overview of their hyperparameters and distributions.</p> <pre><code>from optuna.distributions import (\n    IntDistribution, FloatDistribution, CategoricalDistribution\n)\n\natom.run(\n    models=[\"ET\", \"RF\"],\n    n_trials=30,\n    ht_params={\n        \"dimensions\": {\n            \"all\": {\"n_estimators\": IntDistribution(10, 100, step=10)},\n            \"RF\": {\n                \"max_depth\": IntDistribution(1, 10),\n                \"max_features\": CategoricalDistribution([\"sqrt\", \"log2\"]),\n           },\n        },\n    }\n)\n</code></pre> <p>Parameters for optuna's study and the study's optimize method can be added as kwargs to <code>ht_params</code>. 
For example, to use a different sampler or add a custom callback.</p> <pre><code>from optuna.samplers import RandomSampler\n\natom.run(\n    models=\"LR\",\n    n_trials=30,\n    ht_params={\n        \"sampler\": RandomSampler(seed=atom.random_state),\n        \"callbacks\": custom_callback(),\n    },\n)\n</code></pre> <p>Note</p> <ul> <li>If you use the default sampler, it\u2019s recommended to consider setting   larger <code>n_trials</code> to make full use of the characteristics of TPESampler   because TPESampler uses some (by default, 10) trials for its startup.</li> <li>When specifying distributions manually, make sure to import the   distribution types from optuna: <code>from optuna.distributions import ...</code>.</li> </ul> <p>Warning</p> <p>Keras' models can only use hyperparameter tuning when <code>n_jobs=1</code> or <code>ht_params={\"cv\": 1}</code>. Using n_jobs &gt; 1 and cv &gt; 1 raises a PicklingError due to incompatibilities of the APIs. Read here more about deep learning models.</p> <p>Tip</p> <p>ATOM has several plots that can help you examine a model's study and trials. Have a look at them here.</p> <p></p>"}, {"location": "user_guide/training/#pruning", "title": "Pruning", "text": "<p>During hyperparameter tuning, pruning stops unpromising trials at the early stages of the training (a.k.a., automated early-stopping). This can save the pipeline much time that would otherwise be wasted on an estimator that is unlikely to yield the best results. A pruned trial can't be selected as <code>best_trial</code>. Click here to see an example that uses pruning.</p> <p>The study uses MedianPruner as default pruner. 
You can use any other of optuna's pruners through the <code>ht_params</code> parameter.</p> <pre><code>from optuna.pruners import HyperbandPruner\n\natom.run(\"SGD\", n_trials=30, ht_params={\"pruner\": HyperbandPruner()})\n</code></pre> <p>Warning</p> <ul> <li>Pruning is disabled for multi-metric runs.</li> <li>Pruning is only available for models that support in-training validation.</li> </ul> <p></p>"}, {"location": "user_guide/training/#bootstrapping", "title": "Bootstrapping", "text": "<p>After fitting the estimator, you can assess the robustness of the model using the bootstrap technique. This technique creates several new data sets selecting random  samples from the training set (with replacement) and evaluates them on  the test set. This way you can get a distribution of the performance of the model. The sets are the same for every model. The number of sets can be chosen through the <code>n_bootstrap</code> parameter.</p> <p>Tip</p> <p>Use the plot_results method to plot the boostrap scores in a boxplot.</p> <p></p>"}, {"location": "user_guide/training/#successive-halving", "title": "Successive halving", "text": "<p>Successive halving is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, we recommend only to use this technique with similar models, e.g., only using tree-based models.</p> <p>Run successive halving from atom via the successive_halving method. Consecutive runs of the same model are saved with the model's acronym followed by the number of models in the run. 
For example, a RandomForest in a run with 4 models would become model <code>RF4</code>.</p> <p>See here a successive halving example.</p> <p>Tip</p> <p>Use the plot_successive_halving method to see every model's performance per iteration of the successive halving.</p> <p></p>"}, {"location": "user_guide/training/#train-sizing", "title": "Train sizing", "text": "<p>When training models, there is usually a trade-off between model performance and computation time, that is regulated by the number of samples in the training set. Train sizing can be used to create insights in this trade-off, and help determine the optimal size of the training set. The models are fitted multiple times, ever-increasing the number of samples in the training set.</p> <p>Run train sizing from atom via the train_sizing method. The number of iterations and the number of samples per training can be specified with the <code>train_sizes</code> parameter. Consecutive runs of the same model are saved with the model's acronym followed by the fraction of rows in the training set (the <code>.</code> is removed from the fraction!). For example, a RandomForest in a run with 80% of the training samples would become model <code>RF08</code>.</p> <p>See here a train sizing example.</p> <p>Tip</p> <p>Use the plot_learning_curve method to see the model's performance per size of the training set.</p>"}]}
+{"config": {"lang": ["en"], "separator": "[\\s\\-]+", "pipeline": ["stopWordFilter"]}, "docs": [{"location": "about/", "title": "About", "text": ""}, {"location": "about/#what-is-it", "title": "What is it?", "text": "<p>Automated Tool for Optimized Modeling (ATOM) is an open-source Python package designed to help data scientists fasten up the exploration phase of their machine learning projects. ATOM is a low-code, easy-to-use library, capable of running experiments quickly and efficiently, enabling the user to go from raw data to generating insights in just a few lines of code. Click here to get started.</p> <p></p>"}, {"location": "about/#what-can-i-do-with-it", "title": "What can I do with it?", "text": "<p>ATOM is an end-to-end solution for machine learning pipelines. It supports the user from raw data ingestion to the final results' analysis and model deployment. Click on the icons to read more about its main functionalities.</p> Data cleaning Feature engineering Model selection Hyperparametertuning Model training Model predictions Experiment logging Analysis &amp;Interpretability"}, {"location": "about/#who-is-it-intended-for", "title": "Who is it intended for?", "text": "<ul> <li>Data scientists that want to fasten up the exploration phase of their machine   learning projects.</li> <li>Data scientists that want to run a simple modeling experiment without having   to spend too much time on coding.</li> <li>Data scientists that are new to Python and are not (yet) familiar with all   the relevant machine learning packages.</li> <li>Data analysts without extensive knowledge of machine learning that want to   try out model-based solutions.</li> <li>Anyone who wants to rapidly build a Proof of Concept, for example during a hackathon.</li> <li>Anyone who is new to the field of machine learning and wants a low-code,   easy to learn package, to get started building predictive pipelines.</li> </ul>"}, {"location": "about/#citing-atom", "title": "Citing ATOM", "text": 
"<p>If you use ATOM in a scientific publication, please consider citing this documentation page as the resource. ATOM\u2019s first stable release v2.0.3 was made publicly available in November 2019. A formatted version of the citation would look like this:</p> <p>ATOM v2.0.3, November 2019. URL https://tvdboom.github.io/ATOM/</p> <p>BibTeX entry:</p> <pre><code>@Manual{ATOM,\n    title = {ATOM: A Python package for fast exploration of machine learning pipelines},\n    author = {Mavs},\n    year={2019},\n    mont={November},\n    note = {ATOM version 2.0.3},\n    url = {https://tvdboom.github.io/ATOM/},\n}\n</code></pre> <p></p>"}, {"location": "about/#support", "title": "Support", "text": "<p>ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.</p> <p> </p> <p></p>"}, {"location": "about/#integrations", "title": "Integrations", "text": ""}, {"location": "contributing/", "title": "Contributing", "text": "<p>Are you interested in contributing to ATOM? Do you want to report a bug? Do you have a question? Before you do, please read the following guidelines.</p> <p></p>"}, {"location": "contributing/#submission-context", "title": "Submission context", "text": ""}, {"location": "contributing/#question-or-problem", "title": "Question or problem?", "text": "<p>For quick questions, there's no need to open an issue. Check first if the question isn't already answered in the FAQ section. If not, reach us through the discussions page or on the slack channel.</p>"}, {"location": "contributing/#report-a-bug", "title": "Report a bug?", "text": "<p>If you found a bug in the source code, you can help by submitting an issue to the issue tracker in the GitHub repository. Even better, you can submit a Pull Request with a fix. 
However, before doing so, please read the submission guidelines.</p>"}, {"location": "contributing/#missing-a-feature", "title": "Missing a feature?", "text": "<p>You can request a new feature by submitting an issue to the GitHub Repository. If you would like to implement a new feature, please submit an issue with a proposal for your work first. Please consider what kind of change it is:</p> <ul> <li> <p>For a major feature, first open an issue and outline your proposal so   that it can be discussed. This will also allow us to better coordinate our   efforts, prevent duplication of work, and help you to craft the change so   that it is successfully accepted into the project.</p> </li> <li> <p>Small features and bugs can be crafted and directly submitted as a Pull   Request. However, there is no guarantee that your feature will make it into   <code>master</code>, as it's always a matter of opinion whether if benefits the   overall functionality of the project.</p> </li> </ul>"}, {"location": "contributing/#project-layout", "title": "Project layout", "text": "<p>The latest stable release of ATOM is on the <code>master</code> branch, whereas the latest version of ATOM in development is on the <code>development</code> branch. Make sure you are looking at and working on the correct branch if you're looking to contribute code.</p> <p>In terms of directory structure:</p> <ul> <li>All of ATOM's code sources are in the <code>atom</code> directory.</li> <li>The documentation sources are in the <code>docs_sources</code> directory.</li> <li>Images in the documentation are in the <code>docs_sources/img</code> directory.</li> <li>Tutorial notebooks are in the <code>examples</code> directory. If you want to   include the example to the documentation as well, add the <code>.ipynb</code> file   to <code>docs_sources/examples</code> and update the <code>mkdocs.yml</code> file accordingly.</li> <li>Unit tests are in the <code>tests</code> directory. 
Make sure to add the tests to the   file corresponding to the module in the <code>atom</code> directory with the code that   is being tested.</li> </ul> <p>Make sure to familiarize yourself with the project layout before making any major contributions, and especially make sure to send all code changes to the <code>development</code> branch.</p> <p></p>"}, {"location": "contributing/#submission-guidelines", "title": "Submission guidelines", "text": ""}, {"location": "contributing/#submitting-an-issue", "title": "Submitting an issue", "text": "<p>Before you submit an issue, please search the issue tracker, maybe an issue for your problem already exists, and the discussion might inform you of workarounds readily available.</p> <p>We want to fix all the issues as soon as possible, but before fixing a bug we need to reproduce and confirm it. In order to reproduce bugs we will systematically ask you to provide a minimal reproduction scenario using the custom issue template.</p>"}, {"location": "contributing/#submitting-a-pull-request", "title": "Submitting a pull request", "text": "<p>Before you submit a pull request, please work through this checklist to make sure that you have done the necessary so we can efficiently review and accept your changes.</p> <ul> <li>Update the documentation so all of your changes are reflected there.</li> <li>Adhere to PEP 8 standards.</li> <li>Use a maximum of 91 characters per line. 
Try to keep docstrings below   74 characters.</li> <li>Update the project unit tests to test your code changes as thoroughly   as possible.</li> <li>Make sure that your code is properly commented with docstrings and   comments explaining your rationale behind non-obvious coding practices.</li> <li>Run isort: <code>isort atom tests</code>.</li> <li>Run flake8: <code>flake8 --show-source --statistics atom tests</code>.</li> <li>Run pydocstyle: <code>pydocstyle atom tests</code>.</li> <li>Run mypy: <code>mypy atom tests</code>.</li> </ul> <p>If your contribution requires a new library dependency:</p> <ul> <li>Double-check that the new dependency is easy to install via pip and Anaconda.</li> <li>The library should support Python 3.10 and 3.11.</li> <li>Make sure the code works with the latest version of the library.</li> <li>Update the dependencies in the documentation.</li> <li>Add the library with the minimum required version to <code>pyproject.toml</code>.</li> </ul> <p>After submitting your pull request, GitHub will automatically run the tests on your changes and make sure that the updated code builds successfully. The checks run on Python 3.10 and 3.11, on Ubuntu and Windows. We also use services that automatically check code style and test coverage.</p>"}, {"location": "dependencies/", "title": "Dependencies", "text": ""}, {"location": "dependencies/#python-os", "title": "Python &amp; OS", "text": "<p>As of the moment, ATOM supports the following Python versions:</p> <ul> <li>Python 3.10</li> <li>Python 3.11</li> </ul> <p>And operating systems:</p> <ul> <li>Linux (Ubuntu, Fedora, etc...)</li> <li>Windows 8.1+</li> <li>macOS (not tested)</li> </ul> <p></p>"}, {"location": "dependencies/#packages", "title": "Packages", "text": ""}, {"location": "dependencies/#required", "title": "Required", "text": "<p>ATOM is built on top of several existing Python libraries. 
These packages are necessary for its correct functioning.</p> <ul> <li>beartype (&gt;=0.16.4)</li> <li>category-encoders (&gt;=2.6.3)</li> <li>dagshub (&gt;=0.3.8)</li> <li>dill (&gt;=0.3.6)</li> <li>gplearn (&gt;=0.4.2)</li> <li>imbalanced-learn (&gt;=0.11.0)</li> <li>ipython (&gt;=8.11.0)</li> <li>ipywidgets (&gt;=8.1.1)</li> <li>featuretools (&gt;=1.28.0)</li> <li>joblib (&gt;=1.3.1)</li> <li>matplotlib (&gt;=3.7.2)</li> <li>mlflow (&gt;=2.7.1)</li> <li>modin[ray] (&gt;=0.25.0)</li> <li>nltk (&gt;=3.8.1)</li> <li>numpy (&gt;=1.23.0)</li> <li>optuna (&gt;=3.4.0)</li> <li>pandas[parquet] (&gt;=2.1.2)</li> <li>plotly (&gt;=5.15.0)</li> <li>ray[serve] (&gt;=2.7.1)</li> <li>scikit-learn (&gt;=1.3.1)</li> <li>scikit-learn-intelex (&gt;=2023.2.1)</li> <li>scipy (&gt;=1.10.1)</li> <li>shap (&gt;=0.43.0)</li> <li>sktime (&gt;=0.24.0)</li> <li>zoofs (&gt;=0.1.26)</li> </ul>"}, {"location": "dependencies/#optional", "title": "Optional", "text": "<p>Some specific models, utility methods or plots require the installation of additional libraries. You can install all the optional dependencies using <code>pip install atom-ml[full]</code>. Doing so also installs the following libraries:</p> <ul> <li>botorch (&gt;=0.8.5)</li> <li>catboost (&gt;=1.2)</li> <li>explainerdashboard (&gt;=0.4.3)</li> <li>gradio (&gt;=3.44.4)</li> <li>lightgbm (&gt;=4.1.0)</li> <li>pmdarima (&gt;=2.0.3)</li> <li>schemdraw (&gt;=0.16)</li> <li>sweetviz (&gt;=2.3.1)</li> <li>wordcloud (&gt;=1.9.2)</li> <li>xgboost (&gt;=2.0.0)</li> </ul>"}, {"location": "dependencies/#development", "title": "Development", "text": "<p>The development dependencies are not installed with the package, and are not required for any of its functionalities. These libraries are only necessary to contribute to the project. 
Install them by running <code>pdm install --dev</code> (don't forget to install pdm with <code>pip install -U pdm</code>).</p> <p>Linting</p> <ul> <li>isort (&gt;=5.12.0)</li> <li>flake8 (&gt;=6.0.0)</li> <li>flake8-pyproject (&gt;=1.2.3)</li> <li>pydocstyle (&gt;=6.3.0)</li> <li>mypy (&gt;=1.6.1)</li> <li>pandas_stubs (&gt;=2.1.1.230928)</li> <li>types-requests (&gt;=2.31.0.10)</li> </ul> <p>Testing</p> <ul> <li>nbmake (&gt;=1.4.1)</li> <li>pytest (&gt;=7.2.1)</li> <li>pytest-cov (&gt;=4.0.0)</li> <li>pytest-xdist (&gt;=3.2.0)</li> <li>scikeras (&gt;=0.11.0)</li> <li>tensorflow (&gt;=2.13.0)</li> </ul> <p>Documentation</p> <ul> <li>jupyter-contrib-nbextensions (&gt;=0.7.0)</li> <li>mike (&gt;=1.1.2)</li> <li>mkdocs (&gt;=1.5.3)</li> <li>mkdocs-autorefs (&gt;=0.5.0)</li> <li>mkdocs-jupyter (&gt;=0.24.6)</li> <li>mkdocs-material (&gt;=9.4.7)</li> <li>mkdocs-simple-hooks (&gt;=0.1.5)</li> <li>pymdown-extensions (&gt;=10.3.1)</li> <li>pyyaml (&gt;=6.0)</li> </ul>"}, {"location": "faq/", "title": "Frequently asked questions", "text": "<p>Here we try to give answers to some questions that have popped up regularly. If you have any other questions, don't hesitate to create a new discussion or post them on the Slack channel! </p> Is this package related to the Atom text editor? <p>There is, indeed, a text editor with the same name and a similar logo as     this package. Is this a shameless copy? No. When I started the project,     I didn't know about the text editor, and it doesn't require much thinking     to come up with the idea of replacing the letter O of the word atom with     the image of an atom.</p> How does ATOM relate to AutoML? <p>ATOM is not an AutoML tool since it does not automate the search for an optimal pipeline like well-known AutoML tools such as auto-sklearn or EvalML do. Instead, ATOM helps the user find the optimal pipeline themselves. 
One of the goals of this package is to help data scientists produce explainable pipelines, and using an AutoML black box function would impede that.</p> Is it possible to run deep learning models? <p>Yes. Deep learning models can be added as custom models to the pipeline as long as they follow sklearn's API. For more information, see the deep learning section of the user guide.</p> Can I run atom's methods on just a subset of the columns? <p>Yes, all data cleaning and feature engineering methods accept a <code>columns</code> parameter to only transform the selected features. For example, to only impute the numerical columns in the dataset we could type <code>atom.impute(strat_num=\"mean\", columns=atom.numerical)</code>. The parameter accepts column names, column indices, dtypes or a slice object.</p> How can I compare the same model on different datasets? <p>On many occasions you might want to test how a model performs on datasets processed with different pipelines. For this, atom has the branch system. Create a new branch for every new pipeline you want to test and use the plot methods to compare all models, independent of the branch it was trained on.</p> Can I train models through atom using a GPU? <p>Yes. Refer to the user guide to see what algorithms and models have a GPU implementation. Be aware that it could require additional software and hardware dependencies.</p> How are numerical and categorical columns differentiated? <p>The columns are separated using a dataframe's select_dtypes method. Numerical columns are selected using <code>include=\"number\"</code> whereas categorical columns are selected using <code>exclude=\"number\"</code>.</p> Can I run unsupervised learning pipelines? <p>No. As for now, ATOM only supports supervised machine learning pipelines. However, various unsupervised algorithms can be chosen as strategy in the Pruner class to detect and remove outliers from the dataset.</p> Is there a way to plot multiple models in the same shap plot? 
<p>No. Unfortunately, there is no way to plot multiple models in the same shap plot since the plots are made by the shap package and passed as <code>matplotlib.axes</code> objects to atom. This means that it's not within the reach of this package to implement such a utility.</p> Can I merge a sklearn pipeline with atom? <p>Yes. Like any other transformer, it is possible to add a sklearn pipeline to atom using the add method. Every transformer in the pipeline is merged independently. The pipeline is not allowed to end with a model since atom manages its own models. If that is the case, add the pipeline using <code>atom.add(pipeline[:-1])</code>.</p> Is it possible to initialize atom with an existing train and test set? <p>Yes. If you already have a separated train and test set you can initialize atom in two ways:</p> <ul> <li><code>atom = ATOMClassifier(train, test)</code></li> <li><code>atom = ATOMClassifier((X_train, y_train), (X_test, y_test))</code></li> </ul> <p>Make sure the train and test size have the same number of columns! If atom is initialized in any of these two ways, the <code>test_size</code> parameter is ignored.</p> Can I train the models using cross-validation? <p>Applying cross-validation means transforming every step of the pipeline multiple times, each with different results. Doing this would prevent ATOM from being able to show the transformation results after every pre-processing step, which means losing the ability to inspect how a transformer changed the dataset. For this reason, it is not possible to apply cross-validation until after a model has been trained. After a model has been trained, the pipeline is defined, and cross-validation can be applied using the cross_validate method. See here an example using cross-validation.</p> Is there a way to process datetime features? <p>Yes, the FeatureExtractor class can automatically extract useful features (day, month, year, etc...) from datetime columns. 
The extracted features are always encoded to numerical values, so they can be fed directly to a model.</p>"}, {"location": "getting_started/", "title": "Getting started", "text": ""}, {"location": "getting_started/#installation", "title": "Installation", "text": "<p>Install ATOM's newest release easily via <code>pip</code>:</p> <pre><code>pip install -U atom-ml\n</code></pre> <p>or via <code>conda</code>:</p> <pre><code>conda install -c conda-forge atom-ml\n</code></pre> <p>Note</p> <p>Since atom was already taken, download the package under the name <code>atom-ml</code>!</p> <p>Warning</p> <p>ATOM makes use of many other ML libraries, making its dependency list quite long. Because of that, the installation may take longer than you are accustomed to. Be patient!</p> <p></p> <p>Optional dependencies</p> <p>Some specific models, utility methods or plots require the installation of additional libraries. To install the optional dependencies, add <code>[full]</code> after the package's name.</p> <pre><code>pip install -U atom-ml[full]\n</code></pre> <p></p> <p>Latest source</p> <p>Sometimes, new features and bug fixes are already implemented in the <code>development</code> branch, but waiting for the next release to be made available. If you can't wait for that, it's possible to install the package directly from git.</p> <pre><code>pip install git+https://github.com/tvdboom/ATOM.git@development#egg=atom-ml\n</code></pre> <p>Don't forget to include <code>#egg=atom-ml</code> to explicitly name the project, this way pip can track metadata for it without having to have run the <code>setup.py</code> script.</p> <p></p> <p>Contributing</p> <p>If you are planning to contribute to the project, you'll need the development dependencies. 
Install them adding <code>[dev]</code> after the package's name.</p> <pre><code>pip install -U atom-ml[dev]\n</code></pre> <p>Click here for a complete list of package files for all versions published on PyPI.</p> <p></p>"}, {"location": "getting_started/#usage", "title": "Usage", "text": "<p>ATOM contains a variety of classes and functions to perform data cleaning, feature engineering, model training, plotting and much more. The easiest way to use everything ATOM has to offer is through one of the main classes:</p> <ul> <li>ATOMClassifier for classification tasks.</li> <li>ATOMForecaster for forecasting tasks.</li> <li>ATOMRegressor for regression tasks.</li> </ul> <p>Let's walk you through an example. Click on the SageMaker Studio Lab badge on top of this section to run this example yourself.</p> <p>Make the necessary imports and load the data.</p> <pre><code>&gt;&gt;&gt; import pandas as pd\n&gt;&gt;&gt; from atom import ATOMClassifier\n\n&gt;&gt;&gt; # Load the Australian Weather dataset\n&gt;&gt;&gt; X = pd.read_csv(\"./examples/datasets/weatherAUS.csv\", nrows=100)\n&gt;&gt;&gt; print(X.head())\n\n           Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine WindGustDir  WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm RainToday  RainTomorrow\n0  MelbourneAirport     18.0     26.9      21.4          7.0       8.9         SSE           41.0          W        SSE           9.0          20.0         95.0         54.0       1019.5       1017.0       8.0       5.0     18.5     26.0       Yes             0\n1          Adelaide     17.2     23.4       0.0          NaN       NaN           S           41.0          S        WSW          13.0          19.0         59.0         36.0       1015.7       1015.7       NaN       NaN     17.7     21.9        No             0\n2            Cairns     18.6     24.6       7.4          3.0       6.1         SSE        
   54.0        SSE         SE          26.0          35.0         78.0         57.0       1018.7       1016.6       3.0       3.0     20.8     24.1       Yes             0\n3          Portland     13.6     16.8       4.2          1.2       0.0         ESE           39.0        ESE        ESE          17.0          15.0         76.0         74.0       1021.4       1020.5       7.0       8.0     15.6     16.0       Yes             1\n4           Walpole     16.4     19.9       0.0          NaN       NaN          SE           44.0         SE         SE          19.0          30.0         78.0         70.0       1019.4       1018.9       NaN       NaN     17.4     18.1        No             0\n</code></pre> <p>Initialize the ATOMClassifier or ATOMRegressor class. These two classes are convenient wrappers for the whole machine learning pipeline. Contrary to sklearn's API, they are initialized providing the data you want to manipulate.</p> <pre><code>&gt;&gt;&gt; atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (100, 22)\nTrain set size: 80\nTest set size: 20\n-------------------------------------\nMemory: 17.73 kB\nScaled: False\nMissing values: 193 (8.8%)\nCategorical features: 5 (23.8%)\n</code></pre> <p>Data transformations are applied through atom's methods. For example, calling the impute method will initialize an Imputer instance, fit it on the training set and transform the whole dataset. 
The transformations are applied immediately after calling the method (no fit and transform commands necessary).</p> <pre><code>&gt;&gt;&gt; atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \n\nFitting Imputer...\nImputing missing values...\n --&gt; Imputing 1 missing values with median (0.0) in feature Rainfall.\n --&gt; Imputing 36 missing values with median (4.8) in feature Evaporation.\n --&gt; Imputing 38 missing values with median (8.45) in feature Sunshine.\n --&gt; Imputing 8 missing values with most_frequent (SSE) in feature WindGustDir.\n --&gt; Imputing 8 missing values with median (41.0) in feature WindGustSpeed.\n --&gt; Imputing 7 missing values with most_frequent (ESE) in feature WindDir9am.\n --&gt; Imputing 2 missing values with median (13.0) in feature WindSpeed9am.\n --&gt; Imputing 1 missing values with median (74.0) in feature Humidity9am.\n --&gt; Imputing 6 missing values with median (1017.55) in feature Pressure9am.\n --&gt; Imputing 6 missing values with median (1015.4) in feature Pressure3pm.\n --&gt; Imputing 38 missing values with median (5.5) in feature Cloud9am.\n --&gt; Imputing 40 missing values with median (5.0) in feature Cloud3pm.\n --&gt; Imputing 1 missing values with median (17.2) in feature Temp9am.\n --&gt; Imputing 1 missing values with most_frequent (No) in feature RainToday.\n\n&gt;&gt;&gt; atom.encode(strategy=\"Target\", max_onehot=8)\n\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 42 classes.\n   --&gt; Handling 2 unknown classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n   --&gt; Handling 1 unknown classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</code></pre> <p>Similarly, models are trained and evaluated using the run method. 
Here, we fit both a LogisticRegression and LinearDiscriminantAnalysis model, and apply hyperparameter tuning.</p> <pre><code>&gt;&gt;&gt; atom.run(models=[\"LR\", \"LDA\"], metric=\"auc\", n_trials=6)\n\n\nTraining ========================= &gt;&gt;\nModels: LR, LDA\nMetric: auc\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |      l2 |  1.1302 |     sag |      730 |      0.3 |  0.5417 |   0.5417 |     0.093s |  0.093s | COMPLETE |\n| 1     |    None |  0.1544 |   lbfgs |      120 |      0.5 |  0.8542 |   0.8542 |     0.092s |  0.185s | COMPLETE |\n| 2     |      l2 |  0.0027 |     sag |      460 |      0.4 |  0.5625 |   0.8542 |     0.090s |  0.275s | COMPLETE |\n| 3     |      l2 |  0.0062 |   lbfgs |      800 |      0.8 |  0.6042 |   0.8542 |     0.090s |  0.365s | COMPLETE |\n| 4     | elast.. |  4.2724 |    saga |      530 |      0.1 |  0.6042 |   0.8542 |     0.096s |  0.461s | COMPLETE |\n| 5     |      l2 |  1.3274 | newto.. 
|      680 |      0.3 |  0.5625 |   0.8542 |     0.093s |  0.555s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 1\nBest parameters:\n --&gt; penalty: None\n --&gt; C: 0.1544\n --&gt; solver: lbfgs\n --&gt; max_iter: 120\n --&gt; l1_ratio: 0.5\nBest evaluation --&gt; auc: 0.8542\nTime elapsed: 0.555s\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 1.0\nTest evaluation --&gt; auc: 0.4133\nTime elapsed: 0.074s\n-------------------------------------------------\nTime: 0.629s\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |     svd |      None |  0.6458 |   0.6458 |     0.086s |  0.086s | COMPLETE |\n| 1     |    lsqr |       0.7 |  0.9375 |   0.9375 |     0.081s |  0.167s | COMPLETE |\n| 2     |     svd |       nan |  0.6458 |   0.9375 |     0.001s |  0.168s | COMPLETE |\n| 3     |    lsqr |       0.8 |   0.625 |   0.9375 |     0.079s |  0.247s | COMPLETE |\n| 4     |     svd |       nan |  0.6458 |   0.9375 |     0.000s |  0.247s | COMPLETE |\n| 5     |   eigen |       0.8 |    0.75 |   0.9375 |     0.078s |  0.326s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 1\nBest parameters:\n --&gt; solver: lsqr\n --&gt; shrinkage: 0.7\nBest evaluation --&gt; auc: 0.9375\nTime elapsed: 0.326s\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.8576\nTest evaluation --&gt; auc: 0.8933\nTime elapsed: 0.016s\n-------------------------------------------------\nTime: 0.342s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.005s\n-------------------------------------\nLogisticRegression         --&gt; auc: 0.4133 ~\nLinearDiscriminantAnalysis --&gt; auc: 0.8933 !\n</code></pre> <p>And lastly, analyze the results.</p> 
<pre><code>&gt;&gt;&gt; print(atom.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR       0.60  0.2793  0.4000  0.0000      0.0 -0.2425       0.00     0.0  0.4667\nLDA      0.85  0.7944  0.7667  0.6667      0.5  0.5774       0.75     0.6  0.9067\n\n\n&gt;&gt;&gt; atom.plot_lift()\n</code></pre>"}, {"location": "license/", "title": "MIT License", "text": "<p>Copyright \u00a9 2023 Mavs</p> <p>Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:</p> <p>The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.</p> <p>THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.</p>"}, {"location": "API/ATOM/atomclassifier/", "title": "ATOMClassifier", "text": "<p>class atom.api.ATOMClassifier(*arrays, y=-1, index=False, shuffle=True, stratify=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for classification tasks.</p> <p>Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.</p> <p>All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.</p> <p>Parameters*arrays: sequence of indexables Dataset containing features and target. 
Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to `X`. <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>This parameter is ignored if the target column is provided through <code>arrays</code>.</p> <p>index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe. 
<ul> <li>If False: Reset to RangeIndex.</li> <li>If True: Use the provided index.</li> <li>If int: Position of the column to use as index.</li> <li>If str: Name of the column to use as index.</li> <li>If sequence: Array with shape=(n_samples,) to use as index.</li> </ul> <p>test_size: int or float, default=0.2 <ul> <li>If &lt;=1: Fraction of the dataset to include in the test set.</li> <li>If &gt;1: Number of rows to include in the test set.</li> </ul> <p>This parameter is ignored if the test set is provided through <code>arrays</code>.</p> <p>holdout_size: int, float or None, default=None <ul> <li>If None: No holdout data set is kept apart.</li> <li>If &lt;=1: Fraction of the dataset to include in the holdout set.</li> <li>If &gt;1: Number of rows to include in the holdout set.</li> </ul> <p>This parameter is ignored if the holdout set is provided through <code>arrays</code>.</p> <p>shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets. <p>stratify: bool, int, str or sequence, default=True Handle stratification of the target classes over the data sets. <ul> <li>If False: The data is split randomly.</li> <li>If True: The data is stratified over the target column.</li> <li>Else: Name or position of the columns to stratify by. The   columns can't contain <code>NaN</code> values.</li> </ul> <p>This parameter is ignored if <code>shuffle=False</code> or if the test set is provided through <code>arrays</code>.</p> <p>For multioutput tasks, stratification is applied to the joint target columns.</p> <p>n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows. <ul> <li>If &lt;=1: Fraction of the dataset to select.</li> <li>If &gt;1: Exact number of rows to select. 
Only if <code>arrays</code> is X          or X, y.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. 
Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p>ATOMRegressor Main class for regression tasks.</p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 180 (1.3%)\n\n\n\n&gt;&gt;&gt; # Apply data cleaning and feature engineering methods\n&gt;&gt;&gt; atom.balance(strategy=\"smote\")\n\nOversampling with SMOTE...\n --&gt; Adding 116 samples to class 0.\n\n&gt;&gt;&gt; atom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfe selected 22 features from the dataset.\n   --&gt; Dropping feature mean area (rank 7).\n   --&gt; Dropping feature mean compactness (rank 2).\n   --&gt; Dropping feature mean fractal dimension (rank 6).\n   --&gt; Dropping feature smoothness error (rank 9).\n   --&gt; Dropping feature concave points error (rank 4).\n   --&gt; Dropping feature fractal dimension error (rank 8).\n   --&gt; Dropping feature worst radius (rank 3).\n   --&gt; Dropping feature worst area (rank 5).\n\n\n&gt;&gt;&gt; # Train models\n&gt;&gt;&gt; atom.run(models=[\"LR\", \"RF\", \"XGB\"])\n\n\nTraining ========================= &gt;&gt;\nModels: LR, RF, XGB\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 
0.9878\nTest evaluation --&gt; f1: 0.9859\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9714\nTime elapsed: 0.251s\n-------------------------------------------------\nTime: 0.251s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9718\nTime elapsed: 0.412s\n-------------------------------------------------\nTime: 0.412s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.759s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9859 !\nRandomForest       --&gt; f1: 0.9714\nXGBoost            --&gt; f1: 0.9718\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(atom.results)\n\n     f1_train  f1_test  time_fit      time\nLR     0.9878   0.9859  0.086078  0.086078\nRF     1.0000   0.9714  0.251238  0.251238\nXGB    1.0000   0.9718  0.412373  0.412373\n\n\n&gt;&gt;&gt; print(atom.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR     0.9823  0.9975  0.9811  0.9859   0.9722  0.9621     0.9859  0.9859  0.9960\nRF     0.9646  0.9704  0.9670  0.9714   0.9444  0.9256     0.9855  0.9577  0.9670\nXGB    0.9646  0.9622  0.9621  0.9718   0.9452  0.9242     0.9718  0.9718  0.9621\n</code></pre>"}, {"location": "API/ATOM/atomclassifier/#magic-methods", "title": "Magic methods", "text": "<p>The class contains some magic methods to help you access some of its elements faster. 
Note that methods that apply on the pipeline can return different results per branch.</p> <ul> <li>__repr__: Prints an overview of atom's branches, models and metric.</li> <li>__len__: Returns the length of the dataset.</li> <li>__iter__: Iterate over the pipeline's transformers.</li> <li>__contains__: Checks if the provided item is a column in the dataset.</li> <li>__getitem__: Access a branch, model, column or subset of the dataset.</li> </ul> <p></p>"}, {"location": "API/ATOM/atomclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomclassifier/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributespipeline: PipelinePipeline of transforms. <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. 
X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled. <p>A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\". <p>These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them. <p>This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values. <p>This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values. <p>This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers. <p>This property is unavailable for sparse datasets. 
classes: DataFrameDistribution of target classes per data set. <p>This property is only available for classification tasks. n_classes: int | numpy.integer | Series | modin.pandas.series.SeriesNumber of classes in the target column(s). <p>This property is only available for classification tasks. </p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesbranch: BranchCurrent active branch. <p>Use the property's <code>@setter</code> to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use <code>_from_</code> to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/ATOM/atomclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#utility-methods", "title": "Utility methods", "text": "<p>Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.</p> <p>addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to it's initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a <code>.csv</code> file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.</p> <p>If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. 
If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.</p> <p>Warning</p> <ul> <li>The transformer should have fit and/or transform methods   with arguments <code>X</code> (accepting a dataframe-like object of   shape=(n_samples, n_features)) and/or <code>y</code> (accepting a   sequence of shape=(n_samples,)).</li> <li>The transform method should return a feature set as a   dataframe-like object of shape=(n_samples, n_features)   and/or a target column as a sequence of shape=(n_samples,).</li> </ul> <p>Note</p> <p>If the transform method doesn't return a dataframe:</p> <ul> <li>The column naming happens as follows. If the transformer   has a <code>get_feature_names_out</code> method, it is used. If not,   and it returns the same number of columns, the names are   kept equal. If the number of columns changes, old columns   will keep their name (as long as the column is unchanged)   and new columns will receive the name <code>x[N-1]</code>, where N   stands for the n-th feature. This means that a transformer   should only transform, add or drop columns, not   combinations of these.</li> <li>The index remains the same as before the transformation.   This means that the transformer should not add, remove or   shuffle rows unless it returns a dataframe.</li> </ul> <p>Note</p> <p>If the transformer has a <code>n_jobs</code> and/or <code>random_state</code> parameter that is left to its default value, it adopts atom's value.</p> <p>Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a <code>transform</code> method. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. <p>train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. 
Note that if True, the transformation is skipped when making predictions on new data. <p>**fit_params Additional keyword arguments for the transformer's fit method. </p> <p></p> <p>method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.</p> <p>This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...</p> <p>Note</p> <p>This approach is preferred over changing the dataset directly through the property's <code>@setter</code> since the transformation is stored in the pipeline.</p> <p>Tip</p> <p>Use <code>atom.apply(lambda df: df.drop(\"column_name\",axis=1))</code> to store the removal of columns in the pipeline.</p> <p>Parametersfunc: callable Function to apply with signature <code>func(dataset, **kw_args) -&gt; dataset</code>. <p>inverse_func: callable or None, default=None Inverse function of <code>func</code>. If None, the inverse_transform method returns the input unchanged. <p>kw_args: dict or None, default=None Additional keyword arguments for the function. <p>inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function. </p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.</p> <p>Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.</p> <p>Tip</p> <p>Use the plot_distribution method to plot a column's distribution.</p> <p>Parametersdistributions: str, sequence or None, default=None Names of the distributions in <code>scipy.stats</code> to get the statistics on. If None, a selection of the most common ones is used. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to perform the test on. 
If None, select all numerical columns. <p>Returnspd.DataFrame Statistic results with multiindex levels: <ul> <li>dist: Name of the distribution.</li> <li>stat: Statistic results:<ul> <li>score: KS-test score.</li> <li>p_value: Corresponding p-value. </li> </ul> </li> </ul> <p></p> <p>method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.</p> <p>ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the <code>report</code> attribute. It can either report one dataset or compare two datasets against each other.</p> <p>Warning</p> <p>This method can be slow for large datasets.</p> <p>Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to report.</li> <li>If sequence: Names of two data sets to compare.</li> <li>If dict: Names of up to two data sets with corresponding   selection of rows to report.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.</p> <p>The returned weights are inversely proportional to the class frequencies in the selected data set. 
For multioutput tasks, the weights of each column of <code>y</code> will be multiplied.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsseries Sequence of weights with shape=(n_samples,). </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement an <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. 
</p> <p></p> <p>function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.</p> <p>If the instance was saved using <code>save_data=False</code>, it's possible to load new data into it and apply all data transformations.</p> <p>Info</p> <p>The loaded instance's current branch is the same branch as it was when saved.</p> <p>Parametersfilename: str or Path Filename or pathlib.Path of the pickle file. <p>data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using <code>save_data=False</code>. Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsatom Unpickled atom instance. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. 
It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset(hard=False)[source]Reset the instance to its initial state.</p> <p>Deletes all branches and models. The dataset is also reset to its form after initialization.</p> <p>Parametershard: bool, default=False If True, completely flushes the cache. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a <code>.csv</code> file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. 
Use \"auto\" for automatic naming. <p>rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save. <p>**kwargs Additional keyword arguments for pandas' to_csv method. </p> <p></p> <p>method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.</p> <p>Examples are: float64 -&gt; float32, int64 -&gt; int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.</p> <p>Parametersint2bool: bool, default=False Whether to convert <code>int</code> columns to <code>bool</code> type. Only if the values in the column are strictly in (0, 1) or (-1, 1). <p>int2uint: bool, default=False Whether to convert <code>int</code> to <code>uint</code> (unsigned integer). Only if the values in the column are strictly positive. <p>str2cat: bool, default=False Whether to convert <code>string</code> to <code>category</code>. Only if the number of categories is less than 30% of the column's length. <p>dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to shrink. If None, transform all columns. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. 
<p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method stats()[source]Display basic information about the dataset.</p> <p></p> <p>method status()[source]Get an overview of the branches and models.</p> <p>This method prints the same information as the __repr__ and also saves it to the logger.</p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#data-cleaning", "title": "Data cleaning", "text": "<p>The data cleaning methods can help you scale the data, handle missing values, categorical columns, outliers and unbalanced datasets. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.</p> <p>Tip</p> <p>Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.</p> <p>balanceBalance the number of rows per class in the target column.cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.</p> <p></p> <p>method balance(strategy=\"adasyn\", **kwargs)[source]Balance the number of rows per class in the target column.</p> <p>When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set.</p> <p>See the Balancer class for a description of the parameters.</p> <p>Warning</p> <ul> <li>The balance method does not support multioutput tasks.</li> <li>This 
transformation is only applied to the training set   in order to maintain the original distribution of target   classes in the test set.</li> </ul> <p>Tip</p> <p>Use atom's classes attribute for an overview of the target class distribution per data set.</p> <p></p> <p>method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.</p> <p>Use the parameters to choose which transformations to perform. The available steps are:</p> <ul> <li>Convert dtypes to the best possible types.</li> <li>Drop columns with specific data types.</li> <li>Remove characters from column names.</li> <li>Strip categorical features from spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column (ignored for regression tasks).</li> </ul> <p>See the Cleaner class for a description of the parameters.</p> <p></p> <p>method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.</p> <p>For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. 
Ignores categorical columns.</p> <p>See the Discretizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to visualize a column's distribution and decide on the bins.</p> <p></p> <p>method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.</p> <p>The encoding type depends on the number of classes in the column:</p> <ul> <li>If n_classes=2 or ordinal feature, use Ordinal-encoding.</li> <li>If 2 &lt; n_classes &lt;= <code>max_onehot</code>, use OneHot-encoding.</li> <li>If n_classes &gt; <code>max_onehot</code>, use <code>strategy</code>-encoding.</li> </ul> <p>Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.</p> <p>See the Encoder class for a description of the parameters.</p> <p>Note</p> <p>This method only encodes the categorical features. It does not encode the target column! Use the clean method for that.</p> <p>Tip</p> <p>Use the categorical attribute for a list of the categorical features in the dataset.</p> <p></p> <p>method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.</p> <p>Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. 
Use the <code>missing</code> attribute to customize what are considered \"missing values\".</p> <p>See the Imputer class for a description of the parameters.</p> <p>Tip</p> <p>Use the nans attribute to check the amount of missing values per column.</p> <p></p> <p>method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.</p> <p>This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.</p> <p>See the Normalizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to examine a column's distribution.</p> <p></p> <p>method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.</p> <p>Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.</p> <p>See the Pruner class for a description of the parameters.</p> <p>Note</p> <p>This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.</p> <p>Tip</p> <p>Use the outliers attribute to check the number of outliers per column.</p> <p></p> <p>method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.</p> <p>Apply one of sklearn's scalers. Categorical columns are ignored.</p> <p>See the Scaler class for a description of the parameters.</p> <p>Tip</p> <p>Use the scaled attribute to check whether the dataset is scaled.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#nlp", "title": "NLP", "text": "<p>The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. 
All transformations are applied only on the column in the dataset called <code>corpus</code>. Read more in the user guide.</p> <p>textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.</p> <p></p> <p>method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.</p> <p>Transformations include normalizing characters and drop noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised.</p> <p>See the TextCleaner class for a description of the parameters.</p> <p></p> <p>method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.</p> <p>Convert words to a more uniform standard. The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised. If the provided documents are strings, words are separated by spaces.</p> <p>See the TextNormalizer class for a description of the parameters.</p> <p></p> <p>method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.</p> <p>Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named <code>corpus</code>. 
If there is no column with that name, an exception is raised.</p> <p>See the Tokenizer class for a description of the parameters.</p> <p></p> <p>method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.</p> <p>Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix <code>corpus_</code>. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.</p> <p>See the Vectorizer class for a description of the parameters.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#feature-engineering", "title": "Feature engineering", "text": "<p>To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. Read more in the user guide.</p> <p>feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.</p> <p></p> <p>method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.</p> <p>Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype <code>datetime64</code> are used as is. 
Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.</p> <p>See the FeatureExtractor class for a description of the parameters.</p> <p></p> <p>method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.</p> <p>Create new combinations of existing features to capture the non-linear relations between the original features.</p> <p>See the FeatureGenerator class for a description of the parameters.</p> <p></p> <p>method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.</p> <p>Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the <code>groups</code> method.</p> <p>See the FeatureGrouper class for a description of the parameters.</p> <p>Tip</p> <p>Use a regex pattern with the <code>groups</code> parameter to select groups more easily, e.g., <code>atom.feature_grouping({\"group1\": \"var_.+\"})</code> to select all features that start with <code>var_</code>.</p> <p></p> <p>method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.</p> <p>Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. 
Additionally, remove multicollinear and low-variance features.</p> <p>See the FeatureSelector class for a description of the parameters.</p> <p>Note</p> <ul> <li>When strategy=\"univariate\" and solver=None, f_classif   or f_regression is used as default solver.</li> <li>When strategy is \"sfs\", \"rfecv\" or any of the   advanced strategies and no scoring is specified,   atom's metric (if it exists) is used as scoring.</li> </ul> <p></p> <p></p>"}, {"location": "API/ATOM/atomclassifier/#training", "title": "Training", "text": "<p>The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. Read more in the user guide.</p> <p>runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.</p> <p></p> <p>method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.</p> <p>Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the DirectClassifier or DirectRegressor class for a description of the parameters.</p> <p></p> <p>method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the 
models in a successive halving fashion.</p> <p>The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.</p> <p></p> <p>method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.</p> <p>When training models, there is usually a trade-off between model performance and computation time; that is regulated by the number of samples in the training set. This method can be used to create insights in this trade-off, and help determine the optimal size of the training set. 
The models are fitted multiple times, ever-increasing the number of samples in the training set.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.</p> <p></p>"}, {"location": "API/ATOM/atomforecaster/", "title": "ATOMForecaster", "text": "<p>class atom.api.ATOMForecaster(*arrays, y=-1, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for forecasting tasks.</p> <p>Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.</p> <p>All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.</p> <p>Parameters*arrays: sequence of indexables Dataset containing exogenous features and time series. 
Allowed formats are: <ul> <li>X</li> <li>y</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Exogenous feature set corresponding to y, with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Time series.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>y: int, str, dict, sequence or dataframe, default=-1 Time series. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>This parameter is ignored if the time series is provided through <code>arrays</code>.</p> <p>test_size: int or float, default=0.2 <ul> <li>If &lt;=1: Fraction of the dataset to include in the test set.</li> <li>If &gt;1: Number of rows to include in the test set.</li> </ul> <p>This parameter is ignored if the test set is provided through <code>arrays</code>.</p> <p>holdout_size: int, float or None, default=None <ul> <li>If None: No holdout data set is kept apart.</li> <li>If &lt;=1: Fraction of the dataset to include in the holdout set.</li> <li>If &gt;1: Number of rows to include in the holdout set.</li> </ul> <p>This parameter is ignored if the holdout set is provided through <code>arrays</code>.</p> <p>n_rows: int or float, default=1 Subsample of the dataset to use. 
The cut is made from the head of the dataset (older entries are dropped when sorted by date ascending). The default value selects all rows. <ul> <li>If &lt;=1: Fraction of the dataset to select.</li> <li>If &gt;1: Exact number of rows to select. Only if <code>arrays</code> is X          or X, y.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>ATOMRegressor Main class for regression tasks.</p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMForecaster(y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Univariate forecast.\n\nDataset stats ==================== &gt;&gt;\nShape: (144, 1)\nTrain set size: 116\n --&gt; From: 1949-01  To: 1958-08\nTest set size: 28\n --&gt; From: 1958-09  To: 1960-12\n-------------------------------------\nMemory: 6.47 kB\nDuplicates: 26 (18.1%)\n\n\n\n&gt;&gt;&gt; # Train models\n&gt;&gt;&gt; atom.run(models=[\"NF\", \"ES\", \"ETS\"])\n\n\nTraining ========================= &gt;&gt;\nModels: NF, ES, ETS\nMetric: mape\n\n\nResults for NaiveForecaster:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0858\nTest evaluation --&gt; mape: -0.2305\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.025s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0864\nTest evaluation --&gt; mape: -0.2303\nTime elapsed: 0.042s\n-------------------------------------------------\nTime: 0.042s\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0858\nTest evaluation --&gt; mape: -0.2305\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.090s\n-------------------------------------\nNaiveForecaster      --&gt; mape: -0.2305\nExponentialSmoothing --&gt; mape: -0.2303 !\nETS               
   --&gt; mape: -0.2305\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(atom.results)\n\n     mape_train  mape_test  time_fit      time\nNF      -0.0858    -0.2305  0.025023  0.025023\nES      -0.0864    -0.2303  0.042052  0.042052\nETS     -0.0858    -0.2305  0.021019  0.021019\n\n\n&gt;&gt;&gt; print(atom.evaluate())\n\n         mae    mape         mse      r2      rmse\nNF  -91.8571 -0.2305 -10656.7143 -0.7278 -103.2314\nES  -91.8163 -0.2303 -10647.1506 -0.7263 -103.1850\nETS -91.8563 -0.2305 -10656.5266 -0.7278 -103.2305\n</code></pre>"}, {"location": "API/ATOM/atomforecaster/#magic-methods", "title": "Magic methods", "text": "<p>The class contains some magic methods to help you access some of its elements faster. Note that methods that apply on the pipeline can return different results per branch.</p> <ul> <li>__repr__: Prints an overview of atom's branches, models and metric.</li> <li>__len__: Returns the length of the dataset.</li> <li>__iter__: Iterate over the pipeline's transformers.</li> <li>__contains__: Checks if the provided item is a column in the dataset.</li> <li>__getitem__: Access a branch, model, column or subset of the dataset.</li> </ul> <p></p>"}, {"location": "API/ATOM/atomforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomforecaster/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributespipeline: PipelinePipeline of transforms. <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). 
dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled. <p>A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\". <p>These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them. 
<p>This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values. <p>This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values. <p>This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers. <p>This property is unavailable for sparse datasets. </p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesbranch: BranchCurrent active branch. <p>Use the property's <code>@setter</code> to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use <code>_from_</code> to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. 
results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/ATOM/atomforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#utility-methods", "title": "Utility methods", "text": "<p>Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.</p> <p>addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a <code>.csv</code> file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.</p> <p>If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. 
If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.</p> <p>Warning</p> <ul> <li>The transformer should have fit and/or transform methods   with arguments <code>X</code> (accepting a dataframe-like object of   shape=(n_samples, n_features)) and/or <code>y</code> (accepting a   sequence of shape=(n_samples,)).</li> <li>The transform method should return a feature set as a   dataframe-like object of shape=(n_samples, n_features)   and/or a target column as a sequence of shape=(n_samples,).</li> </ul> <p>Note</p> <p>If the transform method doesn't return a dataframe:</p> <ul> <li>The column naming happens as follows. If the transformer   has a <code>get_feature_names_out</code> method, it is used. If not,   and it returns the same number of columns, the names are   kept equal. If the number of columns changes, old columns   will keep their name (as long as the column is unchanged)   and new columns will receive the name <code>x[N-1]</code>, where N   stands for the n-th feature. This means that a transformer   should only transform, add or drop columns, not   combinations of these.</li> <li>The index remains the same as before the transformation.   This means that the transformer should not add, remove or   shuffle rows unless it returns a dataframe.</li> </ul> <p>Note</p> <p>If the transformer has a <code>n_jobs</code> and/or <code>random_state</code> parameter that is left to its default value, it adopts atom's value.</p> <p>Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a <code>transform</code> method. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. <p>train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. 
Note that if True, the transformation is skipped when making predictions on new data. <p>**fit_params Additional keyword arguments for the transformer's fit method. </p> <p></p> <p>method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.</p> <p>This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...</p> <p>Note</p> <p>This approach is preferred over changing the dataset directly through the property's <code>@setter</code> since the transformation is stored in the pipeline.</p> <p>Tip</p> <p>Use <code>atom.apply(lambda df: df.drop(\"column_name\",axis=1))</code> to store the removal of columns in the pipeline.</p> <p>Parametersfunc: callable Function to apply with signature <code>func(dataset, **kw_args) -&gt; dataset</code>. <p>inverse_func: callable or None, default=None Inverse function of <code>func</code>. If None, the inverse_transform method returns the input unchanged. <p>kw_args: dict or None, default=None Additional keyword arguments for the function. <p>inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function. </p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.</p> <p>Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.</p> <p>Tip</p> <p>Use the plot_distribution method to plot a column's distribution.</p> <p>Parametersdistributions: str, sequence or None, default=None Names of the distributions in <code>scipy.stats</code> to get the statistics on. If None, a selection of the most common ones is used. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to perform the test on. 
If None, select all numerical columns. <p>Returnspd.DataFrame Statistic results with multiindex levels: <ul> <li>dist: Name of the distribution.</li> <li>stat: Statistic results:<ul> <li>score: KS-test score.</li> <li>p_value: Corresponding p-value. </li> </ul> </li> </ul> <p></p> <p>method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.</p> <p>ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the <code>report</code> attribute. It can either report one dataset or compare two datasets against each other.</p> <p>Warning</p> <p>This method can be slow for large datasets.</p> <p>Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to report.</li> <li>If sequence: Names of two data sets to compare.</li> <li>If dict: Names of up to two data sets with corresponding   selection of rows to report.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.</p> <p>The returned weights are inversely proportional to the class frequencies in the selected data set. 
For multioutput tasks, the weights of each column of <code>y</code> will be multiplied.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsseries Sequence of weights with shape=(n_samples,). </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement an <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. 
</p> <p></p> <p>function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.</p> <p>If the instance was saved using <code>save_data=False</code>, it's possible to load new data into it and apply all data transformations.</p> <p>Info</p> <p>The loaded instance's current branch is the same branch as it was when saved.</p> <p>Parametersfilename: str or Path Filename or pathlib.Path of the pickle file. <p>data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using <code>save_data=False</code>. Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsatom Unpickled atom instance. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. 
It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset(hard=False)[source]Reset the instance to its initial state.</p> <p>Deletes all branches and models. The dataset is also reset to its form after initialization.</p> <p>Parametershard: bool, default=False If True, completely flushes the cache. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a <code>.csv</code> file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. 
Use \"auto\" for automatic naming. <p>rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save. <p>**kwargs Additional keyword arguments for pandas' to_csv method. </p> <p></p> <p>method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.</p> <p>Examples are: float64 -&gt; float32, int64 -&gt; int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.</p> <p>Parametersint2bool: bool, default=False Whether to convert <code>int</code> columns to <code>bool</code> type. Only if the values in the column are strictly in (0, 1) or (-1, 1). <p>int2uint: bool, default=False Whether to convert <code>int</code> to <code>uint</code> (unsigned integer). Only if the values in the column are strictly positive. <p>str2cat: bool, default=False Whether to convert <code>string</code> to <code>category</code>. Only if the number of categories is less than 30% of the column's length. <p>dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to shrink. If None, transform all columns. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always presided with the model's acronym: <code>Stack</code>. 
<p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method stats()[source]Display basic information about the dataset.</p> <p></p> <p>method status()[source]Get an overview of the branches and models.</p> <p>This method prints the same information as the __repr__ and also saves it to the logger.</p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#data-cleaning", "title": "Data cleaning", "text": "<p>The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.</p> <p>Tip</p> <p>Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.</p> <p>cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.</p> <p></p> <p>method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.</p> <p>Use the parameters to choose which transformations to perform. 
The available steps are:</p> <ul> <li>Convert dtypes to the best possible types.</li> <li>Drop columns with specific data types.</li> <li>Remove characters from column names.</li> <li>Strip categorical features from spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column (ignored for regression tasks).</li> </ul> <p>See the Cleaner class for a description of the parameters.</p> <p></p> <p>method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.</p> <p>For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.</p> <p>See the Discretizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to visualize a column's distribution and decide on the bins.</p> <p></p> <p>method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.</p> <p>The encoding type depends on the number of classes in the column:</p> <ul> <li>If n_classes=2 or ordinal feature, use Ordinal-encoding.</li> <li>If 2 &lt; n_classes &lt;= <code>max_onehot</code>, use OneHot-encoding.</li> <li>If n_classes &gt; <code>max_onehot</code>, use <code>strategy</code>-encoding.</li> </ul> <p>Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.</p> <p>See the Encoder class for a description of the parameters.</p> <p>Note</p> <p>This method only encodes the categorical features. It does not encode the target column! 
Use the clean method for that.</p> <p>Tip</p> <p>Use the categorical attribute for a list of the categorical features in the dataset.</p> <p></p> <p>method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.</p> <p>Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the <code>missing</code> attribute to customize what are considered \"missing values\".</p> <p>See the Imputer class for a description of the parameters.</p> <p>Tip</p> <p>Use the nans attribute to check the amount of missing values per column.</p> <p></p> <p>method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.</p> <p>This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.</p> <p>See the Normalizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to examine a column's distribution.</p> <p></p> <p>method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.</p> <p>Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.</p> <p>See the Pruner class for a description of the parameters.</p> <p>Note</p> <p>This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.</p> <p>Tip</p> <p>Use the outliers attribute to check the number of outliers per column.</p> <p></p> <p>method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.</p> <p>Apply one of sklearn's scalers. 
Categorical columns are ignored.</p> <p>See the Scaler class for a description of the parameters.</p> <p>Tip</p> <p>Use the scaled attribute to check whether the dataset is scaled.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#nlp", "title": "NLP", "text": "<p>The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called <code>corpus</code>. Read more in the user guide.</p> <p>textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.</p> <p></p> <p>method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.</p> <p>Transformations include normalizing characters and drop noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised.</p> <p>See the TextCleaner class for a description of the parameters.</p> <p></p> <p>method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.</p> <p>Convert words to a more uniform standard. The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised. 
If the provided documents are strings, words are separated by spaces.</p> <p>See the TextNormalizer class for a description of the parameters.</p> <p></p> <p>method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.</p> <p>Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>See the Tokenizer class for a description of the parameters.</p> <p></p> <p>method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.</p> <p>Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix <code>corpus_</code>. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.</p> <p>See the Vectorizer class for a description of the parameters.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#feature-engineering", "title": "Feature engineering", "text": "<p>To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. 
Read more in the user guide.</p> <p>feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.</p> <p></p> <p>method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.</p> <p>Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype <code>datetime64</code> are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.</p> <p>See the FeatureExtractor class for a description of the parameters.</p> <p></p> <p>method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.</p> <p>Create new combinations of existing features to capture the non-linear relations between the original features.</p> <p>See the FeatureGenerator class for a description of the parameters.</p> <p></p> <p>method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.</p> <p>Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. 
The group names and features can be accessed through the <code>groups</code> method.</p> <p>See the FeatureGrouper class for a description of the parameters.</p> <p>Tip</p> <p>Use a regex pattern with the <code>groups</code> parameter to select groups more easily, e.g., <code>atom.feature_grouping({\"group1\": \"var_.+\"})</code> to select all features that start with <code>var_</code>.</p> <p></p> <p>method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.</p> <p>Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.</p> <p>See the FeatureSelector class for a description of the parameters.</p> <p>Note</p> <ul> <li>When strategy=\"univariate\" and solver=None, f_classif   or f_regression is used as default solver.</li> <li>When strategy is \"sfs\", \"rfecv\" or any of the   advanced strategies and no scoring is specified,   atom's metric (if it exists) is used as scoring.</li> </ul> <p></p> <p></p>"}, {"location": "API/ATOM/atomforecaster/#training", "title": "Training", "text": "<p>The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. 
Read more in the user guide.</p> <p>runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.</p> <p></p> <p>method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.</p> <p>Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the DirectClassifier or DirectRegressor class for a description of the parameters.</p> <p></p> <p>method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.</p> <p>The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. 
For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.</p> <p></p> <p>method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.</p> <p>When training models, there is usually a trade-off between model performance and computation time; that is regulated by the number of samples in the training set. This method can be used to create insights in this trade-off, and help determine the optimal size of the training set. 
The models are fitted multiple times, ever-increasing the number of samples in the training set.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.</p> <p></p>"}, {"location": "API/ATOM/atommodel/", "title": "ATOMModel", "text": "<p>function atom.api.ATOMModel(estimator, name=None, acronym=None, needs_scaling=False, native_multilabel=False, native_multioutput=False, has_validation=None)[source]Convert an estimator to a model that can be ingested by atom.</p> <p>This function adds the relevant attributes to the estimator so that they can be used by atom. Note that only estimators that follow sklearn's API are compatible.</p> <p>Read more about custom models in the user guide.</p> <p>Parametersestimator: Predictor Custom estimator. Should implement a <code>fit</code> and <code>predict</code> method. <p>name: str or None, default=None Name for the model. This is the value used to call the model from atom. The value should start with the model's <code>acronym</code> when specified. If None, the capital letters of the estimator's name are used (only if two or more, else it uses the entire name). <p>acronym: str or None, default=None Model's acronym. If None, it uses the model's <code>name</code>. Specify this parameter when you want to train multiple custom models that share the same estimator. <p>needs_scaling: bool, default=False Whether the model should use automated feature scaling. <p>native_multilabel: bool, default=False Whether the model has native support for multilabel tasks. 
If False and the task is multilabel, a multilabel meta-estimator is wrapped around the estimator. <p>native_multioutput: bool, default=False Whether the model has native support for multioutput tasks. If False and the task is multioutput, a multioutput meta-estimator is wrapped around the estimator. <p>has_validation: str or None, default=None Whether the model allows in-training validation. <ul> <li>If None: No support for in-training validation.</li> <li>If str: Name of the estimator's parameter that states the   number of iterations, e.g., <code>n_estimators</code> for   RandomForestClassifier.</li> </ul> <p>ReturnsPredictor Estimator with provided information. Provide this instance to the <code>models</code> parameter of the run method. <p></p>"}, {"location": "API/ATOM/atommodel/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor, ATOMModel\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n&gt;&gt;&gt; from sklearn.linear_model import RANSACRegressor\n\n&gt;&gt;&gt; ransac = ATOMModel(\n...     estimator=RANSACRegressor(),\n...     name=\"RANSAC\",\n...     needs_scaling=False,\n... 
)\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (442, 11)\nTrain set size: 354\nTest set size: 88\n-------------------------------------\nMemory: 39.03 kB\nScaled: False\nOutlier values: 12 (0.3%)\n\n\n&gt;&gt;&gt; atom.run(ransac)\n\n\nTraining ========================= &gt;&gt;\nModels: RANSAC\nMetric: r2\n\n\nResults for RANSACRegressor:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.2946\nTest evaluation --&gt; r2: 0.3787\nTime elapsed: 0.059s\n-------------------------------------------------\nTime: 0.059s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.060s\n-------------------------------------\nRANSACRegressor --&gt; r2: 0.3787\n</code></pre>"}, {"location": "API/ATOM/atomregressor/", "title": "ATOMRegressor", "text": "<p>class atom.api.ATOMRegressor(*arrays, y=-1, index=False, shuffle=True, n_rows=1, test_size=0.2, holdout_size=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Main class for regression tasks.</p> <p>Apply all data transformations and model management provided by the package on a given dataset. Note that, contrary to sklearn's API, the instance contains the dataset on which to perform the analysis. Calling a method will automatically apply it on the dataset it contains.</p> <p>All data cleaning, feature engineering, model training and plotting functionality can be accessed from an instance of this class.</p> <p>Parameters*arrays: sequence of indexables Dataset containing features and target. 
Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>y: int, str, dict, sequence or dataframe, default=-1 Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>This parameter is ignored if the target column is provided through <code>arrays</code>.</p> <p>index: bool, int, str or sequence, default=False Handle the index in the resulting dataframe. 
<ul> <li>If False: Reset to RangeIndex.</li> <li>If True: Use the provided index.</li> <li>If int: Position of the column to use as index.</li> <li>If str: Name of the column to use as index.</li> <li>If sequence: Array with shape=(n_samples,) to use as index.</li> </ul> <p>test_size: int or float, default=0.2 <ul> <li>If &lt;=1: Fraction of the dataset to include in the test set.</li> <li>If &gt;1: Number of rows to include in the test set.</li> </ul> <p>This parameter is ignored if the test set is provided through <code>arrays</code>.</p> <p>holdout_size: int, float or None, default=None <ul> <li>If None: No holdout data set is kept apart.</li> <li>If &lt;=1: Fraction of the dataset to include in the holdout set.</li> <li>If &gt;1: Number of rows to include in the holdout set.</li> </ul> <p>This parameter is ignored if the holdout set is provided through <code>arrays</code>.</p> <p>shuffle: bool, default=True Whether to shuffle the dataset before splitting the train and test set. Be aware that not shuffling the dataset can cause an unequal distribution of target classes over the sets. <p>n_rows: int or float, default=1 Random subsample of the dataset to use. The default value selects all rows. <ul> <li>If &lt;=1: Fraction of the dataset to select.</li> <li>If &gt;1: Exact number of rows to select. Only if <code>arrays</code> is X          or X, y.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. 
The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p></p>"}, {"location": "API/ATOM/atomregressor/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (442, 11)\nTrain set size: 354\nTest set size: 88\n-------------------------------------\nMemory: 39.03 kB\nScaled: False\nOutlier values: 11 (0.3%)\n\n\n\n&gt;&gt;&gt; # Apply data cleaning and feature engineering methods\n&gt;&gt;&gt; atom.scale()\n\nFitting Scaler...\nScaling features...\n\n&gt;&gt;&gt; atom.feature_selection(strategy=\"rfecv\", solver=\"xgb\", n_features=12)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfecv selected 10 features from the dataset.\n\n\n&gt;&gt;&gt; # Train models\n&gt;&gt;&gt; atom.run(models=[\"OLS\", \"RF\", \"XGB\"])\n\n\nTraining ========================= &gt;&gt;\nModels: OLS, RF, XGB\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.5313\nTest evaluation --&gt; r2: 
0.4452\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.9203\nTest evaluation --&gt; r2: 0.3471\nTime elapsed: 0.434s\n-------------------------------------------------\nTime: 0.434s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 1.0\nTest evaluation --&gt; r2: 0.2881\nTime elapsed: 0.187s\n-------------------------------------------------\nTime: 0.187s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.645s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.4452 !\nRandomForest         --&gt; r2: 0.3471 ~\nXGBoost              --&gt; r2: 0.2881 ~\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(atom.results)\n\n     r2_train  r2_test  time_fit      time\nOLS    0.5313   0.4452  0.020018  0.020018\nRF     0.9203   0.3471  0.434395  0.434395\nXGB    1.0000   0.2881  0.187170  0.187170\n\n\n&gt;&gt;&gt; print(atom.evaluate())\n\n         mae    mape        mse      r2     rmse\nOLS -45.1949 -0.4267 -3172.9439  0.4452 -56.3289\nRF  -49.8684 -0.4612 -3733.6766  0.3471 -61.1038\nXGB -52.0370 -0.4708 -4071.0416  0.2881 -63.8047\n</code></pre>"}, {"location": "API/ATOM/atomregressor/#magic-methods", "title": "Magic methods", "text": "<p>The class contains some magic methods to help you access some of its elements faster. 
Note that methods that apply on the pipeline can return different results per branch.</p> <ul> <li>__repr__: Prints an overview of atom's branches, models and metric.</li> <li>__len__: Returns the length of the dataset.</li> <li>__iter__: Iterate over the pipeline's transformers.</li> <li>__contains__: Checks if the provided item is a column in the dataset.</li> <li>__getitem__: Access a branch, model, column or subset of the dataset.</li> </ul> <p></p>"}, {"location": "API/ATOM/atomregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/ATOM/atomregressor/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributespipeline: PipelinePipeline of transforms. <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. 
y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). scaled: boolWhether the feature set is scaled. <p>A data set is considered scaled when it has mean=0 and std=1, or when there is a scaler in the pipeline. Binary columns (only zeros and ones) are excluded from the calculation. duplicates: int | numpy.integerNumber of duplicate rows in the dataset. missing: list[Any]Values that are considered \"missing\". <p>These values are used by the clean and impute methods. Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. nans: Series | modin.pandas.series.SeriesColumns with the number of missing values in them. <p>This property is unavailable for sparse datasets. n_nans: intNumber of rows containing missing values. <p>This property is unavailable for sparse datasets. numerical: IndexNames of the numerical features in the dataset. n_numerical: intNumber of numerical features in the dataset. categorical: IndexNames of the categorical features in the dataset. n_categorical: intNumber of categorical features in the dataset. outliers: SeriesColumns in training set with number of outlier values. <p>This property is unavailable for sparse datasets. n_outliers: int | numpy.integerNumber of samples in the training set containing outliers. <p>This property is unavailable for sparse datasets. 
</p> <p></p>"}, {"location": "API/ATOM/atomregressor/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesbranch: BranchCurrent active branch. <p>Use the property's <code>@setter</code> to change the branch or to create a new one. If the value is the name of an existing branch, switch to that one. Else, create a new branch using that name. The new branch is split from the current branch. Use <code>_from_</code> to split the new branch from any other existing branch. Read more in the user guide. models: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. 
</li> </ul> <p></p>"}, {"location": "API/ATOM/atomregressor/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/ATOM/atomregressor/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/ATOM/atomregressor/#utility-methods", "title": "Utility methods", "text": "<p>Next to the plotting methods, the class contains a variety of utility methods to handle the data and manage the pipeline.</p> <p>addAdd a transformer to the pipeline.applyApply a function to the dataset.available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.distributionGet statistics on column distributions.edaCreate an Exploratory Data Analysis report.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_sample_weightReturn sample weights for a balanced data set.inverse_transformInversely transform new data through the pipeline.loadLoad an atom instance from a pickle file.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.resetReset the instance to its initial state.reset_aestheticsReset the plot aesthetics to their default values.saveSave the instance to a pickle file.save_dataSave the data in the current branch to a <code>.csv</code> file.shrinkConvert the columns to the smallest possible matching dtype.stackingAdd a Stacking model to the pipeline.statsDisplay basic information about the dataset.statusGet an overview of the branches and models.transformTransform new data through the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method add(transformer, columns=None, train_only=False, **fit_params)[source]Add a transformer to the pipeline.</p> <p>If the transformer is not fitted, it is fitted on the complete training set. Afterwards, the data set is transformed and the estimator is added to atom's pipeline. 
If the estimator is a sklearn Pipeline, every estimator is merged independently with atom.</p> <p>Warning</p> <ul> <li>The transformer should have fit and/or transform methods   with arguments <code>X</code> (accepting a dataframe-like object of   shape=(n_samples, n_features)) and/or <code>y</code> (accepting a   sequence of shape=(n_samples,)).</li> <li>The transform method should return a feature set as a   dataframe-like object of shape=(n_samples, n_features)   and/or a target column as a sequence of shape=(n_samples,).</li> </ul> <p>Note</p> <p>If the transform method doesn't return a dataframe:</p> <ul> <li>The column naming happens as follows. If the transformer   has a <code>get_feature_names_out</code> method, it is used. If not,   and it returns the same number of columns, the names are   kept equal. If the number of columns changes, old columns   will keep their name (as long as the column is unchanged)   and new columns will receive the name <code>x[N-1]</code>, where N   stands for the n-th feature. This means that a transformer   should only transform, add or drop columns, not   combinations of these.</li> <li>The index remains the same as before the transformation.   This means that the transformer should not add, remove or   shuffle rows unless it returns a dataframe.</li> </ul> <p>Note</p> <p>If the transformer has a <code>n_jobs</code> and/or <code>random_state</code> parameter that is left to its default value, it adopts atom's value.</p> <p>Parameterstransformer: Transformer Estimator to add to the pipeline. Should implement a <code>transform</code> method. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to transform. Only select features or the target column, not both at the same time (if that happens, the target column is ignored). If None, transform all columns. <p>train_only: bool, default=False Whether to apply the estimator only on the training set or on the complete dataset. 
Note that if True, the transformation is skipped when making predictions on new data. <p>**fit_params Additional keyword arguments for the transformer's fit method. </p> <p></p> <p>method apply(func, inverse_func=None, kw_args=None, inv_kw_args=None, **kwargs)[source]Apply a function to the dataset.</p> <p>This method is useful for stateless transformations such as taking the log, doing custom scaling, etc...</p> <p>Note</p> <p>This approach is preferred over changing the dataset directly through the property's <code>@setter</code> since the transformation is stored in the pipeline.</p> <p>Tip</p> <p>Use <code>atom.apply(lambda df: df.drop(\"column_name\",axis=1))</code> to store the removal of columns in the pipeline.</p> <p>Parametersfunc: callable Function to apply with signature <code>func(dataset, **kw_args) -&gt; dataset</code>. <p>inverse_func: callable or None, default=None Inverse function of <code>func</code>. If None, the inverse_transform method returns the input unchanged. <p>kw_args: dict or None, default=None Additional keyword arguments for the function. <p>inv_kw_args: dict or None, default=None Additional keyword arguments for the inverse function. </p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method distribution(distributions=None, columns=None)[source]Get statistics on column distributions.</p> <p>Compute the Kolmogorov-Smirnov test for various distributions against columns in the dataset. Only for numerical columns. Missing values are ignored.</p> <p>Tip</p> <p>Use the plot_distribution method to plot a column's distribution.</p> <p>Parametersdistributions: str, sequence or None, default=None Names of the distributions in <code>scipy.stats</code> to get the statistics on. If None, a selection of the most common ones is used. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to perform the test on. 
If None, select all numerical columns. <p>Returnspd.DataFrame Statistic results with multiindex levels: <ul> <li>dist: Name of the distribution.</li> <li>stat: Statistic results:<ul> <li>score: KS-test score.</li> <li>p_value: Corresponding p-value. </li> </ul> </li> </ul> <p></p> <p>method eda(rows=\"dataset\", target=0, filename=None)[source]Create an Exploratory Data Analysis report.</p> <p>ATOM uses the sweetviz package for EDA. The report is rendered directly in the notebook. It can also be accessed through the <code>report</code> attribute. It can either report one dataset or compare two datasets against each other.</p> <p>Warning</p> <p>This method can be slow for large datasets.</p> <p>Parametersrows: str, sequence or dict, default=\"dataset\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to report.</li> <li>If sequence: Names of two data sets to compare.</li> <li>If dict: Names of up to two data sets with corresponding   selection of rows to report.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. Only bool and numerical features can be used as target. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the (html) file to save. If None, don't save anything. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_sample_weight(rows=\"train\")[source]Return sample weights for a balanced data set.</p> <p>The returned weights are inversely proportional to the class frequencies in the selected data set. 
For multioutput tasks, the weights of each column of <code>y</code> will be multiplied.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsseries Sequence of weights with shape=(n_samples,). </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement an <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be used to transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. 
</p> <p></p> <p>function atom.atom.load(filename, data=None)[source]Load an atom instance from a pickle file.</p> <p>If the instance was saved using <code>save_data=False</code>, it's possible to load new data into it and apply all data transformations.</p> <p>Info</p> <p>The loaded instance's current branch is the same branch as it was when saved.</p> <p>Parametersfilename: str or Path Filename or pathlib.Path of the pickle file. <p>data: tuple of indexables or None, default=None Original dataset as it was provided to the instance's constructor. Only use this parameter if the loaded file was saved using <code>save_data=False</code>. Allowed formats are: <ul> <li>X</li> <li>X, y</li> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>X, train, test: dataframe-like Feature set with shape=(n_samples, n_features).</p> <p>y: int, str or sequence Target column corresponding to `X`.</p> <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsatom Unpickled atom instance. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. 
It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset(hard=False)[source]Reset the instance to its initial state.</p> <p>Deletes all branches and models. The dataset is also reset to its form after initialization.</p> <p>Parametershard: bool, default=False If True, completely flushes the cache. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method save_data(filename=\"auto\", rows=\"dataset\", **kwargs)[source]Save the data in the current branch to a <code>.csv</code> file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. 
Use \"auto\" for automatic naming. <p>rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows to save. <p>**kwargs Additional keyword arguments for pandas' to_csv method. </p> <p></p> <p>method shrink(int2bool=False, int2uint=False, str2cat=False, dense2sparse=False, columns=None)[source]Convert the columns to the smallest possible matching dtype.</p> <p>Examples are: float64 -&gt; float32, int64 -&gt; int8, etc... Sparse arrays also transform their non-fill value. Use this method for memory optimization before saving the dataset. Note that applying transformers to the data may alter the types again.</p> <p>Parametersint2bool: bool, default=False Whether to convert <code>int</code> columns to <code>bool</code> type. Only if the values in the column are strictly in (0, 1) or (-1, 1). <p>int2uint: bool, default=False Whether to convert <code>int</code> to <code>uint</code> (unsigned integer). Only if the values in the column are strictly positive. <p>str2cat: bool, default=False Whether to convert <code>string</code> to <code>category</code>. Only if the number of categories is less than 30% of the column's length. <p>dense2sparse: bool, default=False Whether to convert all features to sparse format. The value that is compressed is the most frequent value in the column. <p>columns: int, str, segment, sequence, dataframe or None, default=None Selection of columns to shrink. If None, transform all columns. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. 
<p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method stats()[source]Display basic information about the dataset.</p> <p></p> <p>method status()[source]Get an overview of the branches and models.</p> <p>This method prints the same information as the __repr__ and also saves it to the logger.</p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p> <p></p>"}, {"location": "API/ATOM/atomregressor/#data-cleaning", "title": "Data cleaning", "text": "<p>The data cleaning methods can help you scale the data, handle missing values, categorical columns and outliers. All attributes of the data cleaning classes are attached to atom after running. Read more in the user guide.</p> <p>Tip</p> <p>Use the eda method to examine the data and help you determine suitable parameters for the data cleaning methods.</p> <p>cleanApply standard data cleaning steps on the dataset.discretizeBin continuous data into intervals.encodePerform encoding of categorical features.imputeHandle missing values in the dataset.normalizeTransform the data to follow a Normal/Gaussian distribution.prunePrune outliers from the training set.scaleScale the data.</p> <p></p> <p>method clean(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, **kwargs)[source]Apply standard data cleaning steps on the dataset.</p> <p>Use the parameters to choose which transformations to perform. 
The available steps are:</p> <ul> <li>Convert dtypes to the best possible types.</li> <li>Drop columns with specific data types.</li> <li>Remove characters from column names.</li> <li>Strip categorical features from spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column (ignored for regression tasks).</li> </ul> <p>See the Cleaner class for a description of the parameters.</p> <p></p> <p>method discretize(strategy=\"quantile\", bins=5, labels=None, **kwargs)[source]Bin continuous data into intervals.</p> <p>For each feature, the bin edges are computed during fit and, together with the number of bins, they will define the intervals. Ignores categorical columns.</p> <p>See the Discretizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to visualize a column's distribution and decide on the bins.</p> <p></p> <p>method encode(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"rare\", **kwargs)[source]Perform encoding of categorical features.</p> <p>The encoding type depends on the number of classes in the column:</p> <ul> <li>If n_classes=2 or ordinal feature, use Ordinal-encoding.</li> <li>If 2 &lt; n_classes &lt;= <code>max_onehot</code>, use OneHot-encoding.</li> <li>If n_classes &gt; <code>max_onehot</code>, use <code>strategy</code>-encoding.</li> </ul> <p>Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Rare classes can be replaced with a value in order to prevent too high cardinality.</p> <p>See the Encoder class for a description of the parameters.</p> <p>Note</p> <p>This method only encodes the categorical features. It does not encode the target column! 
Use the clean method for that.</p> <p>Tip</p> <p>Use the categorical attribute for a list of the categorical features in the dataset.</p> <p></p> <p>method impute(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, **kwargs)[source]Handle missing values in the dataset.</p> <p>Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. Use the <code>missing</code> attribute to customize what are considered \"missing values\".</p> <p>See the Imputer class for a description of the parameters.</p> <p>Tip</p> <p>Use the nans attribute to check the amount of missing values per column.</p> <p></p> <p>method normalize(strategy=\"yeojohnson\", **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.</p> <p>This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Ignores categorical columns.</p> <p>See the Normalizer class for a description of the parameters.</p> <p>Tip</p> <p>Use the plot_distribution method to examine a column's distribution.</p> <p></p> <p>method prune(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, **kwargs)[source]Prune outliers from the training set.</p> <p>Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.</p> <p>See the Pruner class for a description of the parameters.</p> <p>Note</p> <p>This transformation is only applied to the training set in order to maintain the original distribution of samples in the test set.</p> <p>Tip</p> <p>Use the outliers attribute to check the number of outliers per column.</p> <p></p> <p>method scale(strategy=\"standard\", include_binary=False, **kwargs)[source]Scale the data.</p> <p>Apply one of sklearn's scalers. 
Categorical columns are ignored.</p> <p>See the Scaler class for a description of the parameters.</p> <p>Tip</p> <p>Use the scaled attribute to check whether the dataset is scaled.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomregressor/#nlp", "title": "NLP", "text": "<p>The Natural Language Processing (NLP) transformers help to convert raw text to meaningful numeric values, ready to be ingested by a model. All transformations are applied only on the column in the dataset called <code>corpus</code>. Read more in the user guide.</p> <p>textcleanApply standard text cleaning to the corpus.textnormalizeNormalize the corpus.tokenizeTokenize the corpus.vectorizeVectorize the corpus.</p> <p></p> <p>method textclean(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, **kwargs)[source]Apply standard text cleaning to the corpus.</p> <p>Transformations include normalizing characters and drop noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised.</p> <p>See the TextCleaner class for a description of the parameters.</p> <p></p> <p>method textnormalize(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, **kwargs)[source]Normalize the corpus.</p> <p>Convert words to a more uniform standard. The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised. 
If the provided documents are strings, words are separated by spaces.</p> <p>See the TextNormalizer class for a description of the parameters.</p> <p></p> <p>method tokenize(bigram_freq=None, trigram_freq=None, quadgram_freq=None, **kwargs)[source]Tokenize the corpus.</p> <p>Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>See the Tokenizer class for a description of the parameters.</p> <p></p> <p>method vectorize(strategy=\"bow\", return_sparse=True, **kwargs)[source]Vectorize the corpus.</p> <p>Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix <code>corpus_</code>. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.</p> <p>See the Vectorizer class for a description of the parameters.</p> <p></p> <p></p>"}, {"location": "API/ATOM/atomregressor/#feature-engineering", "title": "Feature engineering", "text": "<p>To further pre-process the data, it's possible to extract features from datetime columns, create new non-linear features transforming the existing ones, group similar features or, if the dataset is too large, remove features. 
Read more in the user guide.</p> <p>feature_extractionExtract features from datetime columns.feature_generationGenerate new features.feature_groupingExtract statistics from similar features.feature_selectionReduce the number of features in the data.</p> <p></p> <p>method feature_extraction(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, **kwargs)[source]Extract features from datetime columns.</p> <p>Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype <code>datetime64</code> are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.</p> <p>See the FeatureExtractor class for a description of the parameters.</p> <p></p> <p>method feature_generation(strategy=\"dfs\", n_features=None, operators=None, **kwargs)[source]Generate new features.</p> <p>Create new combinations of existing features to capture the non-linear relations between the original features.</p> <p>See the FeatureGenerator class for a description of the parameters.</p> <p></p> <p>method feature_grouping(groups, operators=None, drop_columns=True, **kwargs)[source]Extract statistics from similar features.</p> <p>Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. 
The group names and features can be accessed through the <code>groups</code> method.</p> <p>See the FeatureGrouper class for a description of the parameters.</p> <p>Tip</p> <p>Use a regex pattern with the <code>groups</code> parameter to select groups more easily, e.g., <code>atom.feature_grouping({\"group1\": \"var_.+\"})</code> to select all features that start with <code>var_</code>.</p> <p></p> <p>method feature_selection(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, **kwargs)[source]Reduce the number of features in the data.</p> <p>Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. Additionally, remove multicollinear and low-variance features.</p> <p>See the FeatureSelector class for a description of the parameters.</p> <p>Note</p> <ul> <li>When strategy=\"univariate\" and solver=None, f_classif   or f_regression is used as default solver.</li> <li>When strategy is \"sfs\", \"rfecv\" or any of the   advanced strategies and no scoring is specified,   atom's metric (if it exists) is used as scoring.</li> </ul> <p></p> <p></p>"}, {"location": "API/ATOM/atomregressor/#training", "title": "Training", "text": "<p>The training methods are where the models are fitted to the data and their performance is evaluated against a selected metric. There are three methods to call the three different training approaches. 
Read more in the user guide.</p> <p>runTrain and evaluate the models in a direct fashion.successive_halvingFit the models in a successive halving fashion.train_sizingTrain and evaluate the models in a train sizing fashion.</p> <p></p> <p>method run(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a direct fashion.</p> <p>Contrary to successive_halving and train_sizing, the direct approach only iterates once over the models, using the full dataset.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the DirectClassifier or DirectRegressor class for a description of the parameters.</p> <p></p> <p>method successive_halving(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Fit the models in a successive halving fashion.</p> <p>The successive halving technique is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. 
For this reason, it is recommended to only use this technique with similar models, e.g., only using tree-based models.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the SuccessiveHalvingClassifier or SuccessiveHalvingRegressor class for a description of the parameters.</p> <p></p> <p>method train_sizing(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", **kwargs)[source]Train and evaluate the models in a train sizing fashion.</p> <p>When training models, there is usually a trade-off between model performance and computation time; that is regulated by the number of samples in the training set. This method can be used to create insights in this trade-off, and help determine the optimal size of the training set. 
The models are fitted multiple times, ever-increasing the number of samples in the training set.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test    set (optional).</li> </ol> <p>See the TrainSizingClassifier or TrainSizingRegressor class for a description of the parameters.</p> <p></p>"}, {"location": "API/branch/branch/", "title": "Branch", "text": "<p>class atom.branch.branch.Branch(name, memory=None, data=None, holdout=None)[source]Object that contains the data.</p> <p>A branch contains a specific pipeline, the dataset transformed through that pipeline, the models fitted on that dataset, and all data and utility attributes that refer to that dataset. Branches can be created and accessed through atom's <code>branch</code> attribute.</p> <p>All public properties and attributes of the branch can be accessed from the parent.</p> <p>Read more in the user guide.</p> <p>Warning</p> <p>This class should not be called directly. Branches are created internally by the ATOMClassifier, ATOMForecaster and ATOMRegressor classes.</p> <p>Parametersname: str Name of the branch. <p>memory: str, Memory or None, default=None Memory object for pipeline caching and to store the data when the branch is inactive. <p>data: DataContainer or None, default=None Data for the branch. <p>holdout: dataframe or None, default=None Holdout data set. 
<p></p> <p></p> <p>See Also</p> <p>BranchManager Object that manages branches.</p> <p></p>"}, {"location": "API/branch/branch/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 177 (1.3%)\n\n\n\n&gt;&gt;&gt; # Train a model\n&gt;&gt;&gt; atom.run(\"RF\")\n\n\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9517\nTime elapsed: 0.236s\n-------------------------------------------------\nTime: 0.236s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.239s\n-------------------------------------\nRandomForest --&gt; f1: 0.9517\n\n\n&gt;&gt;&gt; # Change the branch and apply feature scaling\n&gt;&gt;&gt; atom.branch = \"scaled\"\n\nSuccessfully created new branch: scaled.\n\n\n&gt;&gt;&gt; atom.scale()\n\nFitting Scaler...\nScaling features...\n\n&gt;&gt;&gt; atom.run(\"RF_scaled\")\n\n\nTraining ========================= &gt;&gt;\nModels: RF_scaled\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9517\nTime elapsed: 0.237s\n-------------------------------------------------\nTime: 0.237s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 
0.240s\n-------------------------------------\nRandomForest --&gt; f1: 0.9517\n\n\n&gt;&gt;&gt; # Compare the models\n&gt;&gt;&gt; atom.plot_roc()\n</code></pre>"}, {"location": "API/branch/branch/#attributes", "title": "Attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). 
</p> <p></p>"}, {"location": "API/branch/branch/#methods", "title": "Methods", "text": "<p>loadLoad the branch's data from memory.storeStore the branch's data as a pickle in memory.</p> <p></p> <p>method load(assign=True)[source]Load the branch's data from memory.</p> <p>This method is used to restore the data of inactive branches.</p> <p>Parametersassign: bool, default=True Whether to assign the loaded data to <code>self</code>. <p>ReturnsDataContainer or None Own data information. Returns None if no data is set. </p> <p></p> <p>method store(assign=True)[source]Store the branch's data as a pickle in memory.</p> <p>After storage, the data is deleted, and the branch is no longer usable until load is called. This method is used to store the data for inactive branches.</p> <p>Note</p> <p>This method is skipped silently for branches with no memory allocation.</p> <p>Parametersassign: bool, default=True Whether to assign <code>None</code> to the data in <code>self</code>. </p> <p></p>"}, {"location": "API/branch/branchmanager/", "title": "BranchManager", "text": "<p>class atom.branch.branchmanager.BranchManager(memory=None)[source]Object that manages branches.</p> <p>Maintains references to a series of branches and the current active branch. Additionally, always stores an 'original' branch containing the original dataset (previous to any transformations). The branches share a reference to a holdout set, not the instance self. When a memory object is specified, it stores inactive branches in memory.</p> <p>Read more in the user guide.</p> <p>Warning</p> <p>This class should not be called directly. The BranchManager is created internally by the ATOMClassifier, ATOMForecaster and ATOMRegressor classes.</p> <p>Parametersmemory: str, Memory or None, default=None Location to store inactive branches. If None, all branches are kept in memory. This memory object is passed to the branches for pipeline caching. <p>Attributesbranches: ClassMap Collection of branches. 
<p>og: Branch Branch containing the original dataset. It can be any branch in <code>branches</code> or an internally made branch called <code>og</code>. <p>current: Branch Current active branch. <p></p> <p></p> <p>See Also</p> <p>Branch Object that contains the data.</p> <p></p>"}, {"location": "API/branch/branchmanager/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 174 (1.2%)\n\n\n\n&gt;&gt;&gt; # Train a model\n&gt;&gt;&gt; atom.run(\"RF\")\n\n\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9655\nTime elapsed: 0.229s\n-------------------------------------------------\nTime: 0.229s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.232s\n-------------------------------------\nRandomForest --&gt; f1: 0.9655\n\n\n&gt;&gt;&gt; # Change the branch and apply feature scaling\n&gt;&gt;&gt; atom.branch = \"scaled\"\n\nSuccessfully created new branch: scaled.\n\n\n&gt;&gt;&gt; atom.scale()\n\nFitting Scaler...\nScaling features...\n\n&gt;&gt;&gt; atom.run(\"RF_scaled\")\n\n\nTraining ========================= &gt;&gt;\nModels: RF_scaled\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 
0.9722\nTime elapsed: 0.228s\n-------------------------------------------------\nTime: 0.228s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.231s\n-------------------------------------\nRandomForest --&gt; f1: 0.9722\n\n\n&gt;&gt;&gt; # Compare the models\n&gt;&gt;&gt; atom.plot_roc()\n</code></pre>"}, {"location": "API/branch/branchmanager/#attributes", "title": "Attributes", "text": "<p>Attributesbranches: ClassMap Collection of branches. <p>og: Branch Branch containing the original dataset. It can be any branch in <code>branches</code> or an internally made branch called <code>og</code>. <p>current: Branch Current active branch. <p></p> <p></p>"}, {"location": "API/branch/branchmanager/#methods", "title": "Methods", "text": "<p>addAdd a new branch to the manager.fillFill the current branch with data.resetReset this instance to its initial state.</p> <p></p> <p>method add(name, parent=None)[source]Add a new branch to the manager.</p> <p>If the branch is called <code>og</code> (reserved name for the original branch), it's created separately and stored in memory.</p> <p>Parametersname: str Name for the new branch. <p>parent: Branch or None, default=None Parent branch. Data and attributes from the parent are passed to the new branch. </p> <p></p> <p>method fill(data, holdout=None)[source]Fill the current branch with data.</p> <p>Parametersdata: DataContainer New data for the current branch. <p>holdout: dataframe or None, default=None Holdout data set (if any). </p> <p></p> <p>method reset(hard=False)[source]Reset this instance to its initial state.</p> <p>The initial state of the BranchManager contains a single branch called <code>main</code> with no data. There's no reference to an original (<code>og</code>) branch.</p> <p>Parametershard: bool, default=False If True, flushes completely the cache. 
</p> <p></p>"}, {"location": "API/data_cleaning/balancer/", "title": "Balancer", "text": "<p>class atom.data_cleaning.Balancer(strategy=\"ADASYN\", n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Balance the number of samples per class in the target column.</p> <p>When oversampling, the newly created samples have an increasing integer index for numerical indices, and an index of the form [estimator]_N for non-numerical indices, where N stands for the N-th sample in the data set. Use only for classification tasks.</p> <p>This class can be accessed from atom through the balance method. Read more in the user guide.</p> <p>Warning</p> <ul> <li>The clustercentroids estimator is unavailable because of    incompatibilities of the APIs.</li> <li>The Balancer class does not support multioutput tasks.</li> </ul> <p>Parametersstrategy: str or estimator, default=\"ADASYN\" Type of algorithm with which to balance the dataset. Choose from the name of any estimator in the imbalanced-learn package or provide a custom instance of such. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 - value.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. 
<p>Attributes[strategy]_: imblearn estimator Object (lowercase strategy) used to balance the data, e.g., <code>balancer.adasyn_</code> for the default strategy. <p>mapping_: dict Target values mapped to their respective encoded integers. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>target_names_in_: np.ndarray Names of the target column seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Encoder Perform encoding of categorical features.</p> <p>Imputer Handle missing values in the data.</p> <p>Pruner Prune outliers from the data.</p> <p></p>"}, {"location": "API/data_cleaning/balancer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.train)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630             0.054390         0.1720  ...           107.30       740.4            0.1610            0.42250          0.50300               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690             0.094510         0.1860  ...           
142.20      1493.0            0.1492            0.25360          0.37590               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699             0.047440         0.1538  ...           135.10      1320.0            0.1315            0.18060          0.20800               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686             0.027390         0.1852  ...           110.10       931.4            0.1148            0.09866          0.15470               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263             0.023080         0.1305  ...            63.34       270.0            0.1179            0.18790          0.15440               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n451        19.73         19.82          130.70     1206.0          0.10620           0.18490         0.24170             0.097400         0.1733  ...           159.80      1933.0            0.1710            0.59550          0.84890               0.25070          0.2749                  0.12970       0\n452        12.72         13.78           81.78      492.1          0.09667           0.08393         0.01288             0.019240         0.1638  ...            
88.54       553.7            0.1298            0.14720          0.05233               0.06343          0.2369                  0.06922       1\n453        11.51         23.93           74.52      403.5          0.09261           0.10210         0.11120             0.041050         0.1388  ...            82.28       474.2            0.1298            0.25170          0.36300               0.09653          0.2112                  0.08732       1\n454        10.75         14.97           68.26      355.3          0.07793           0.05139         0.02251             0.007875         0.1399  ...            77.79       441.2            0.1076            0.12230          0.09755               0.03413          0.2300                  0.06769       1\n455        25.22         24.91          171.50     1878.0          0.10630           0.26650         0.33390             0.184500         0.1829  ...           211.70      2562.0            0.1573            0.60760          0.64760               0.28670          0.2355                  0.10510       0\n\n[456 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.balance(strategy=\"smote\", verbose=2)\n\nOversampling with SMOTE...\n --&gt; Adding 116 samples to class 0.\n\n\n&gt;&gt;&gt; # Note that the number of rows has increased\n&gt;&gt;&gt; print(atom.train)\n\n     mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter   worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      13.480000     20.820000       88.400000   559.200000         0.101600          0.125500        0.106300             0.054390       0.172000  ...       
107.300000   740.400000          0.161000           0.422500         0.503000              0.225800        0.280700                 0.107100       0\n1      18.310000     20.580000      120.800000  1052.000000         0.106800          0.124800        0.156900             0.094510       0.186000  ...       142.200000  1493.000000          0.149200           0.253600         0.375900              0.151000        0.307400                 0.078630       0\n2      17.930000     24.480000      115.200000   998.900000         0.088550          0.070270        0.056990             0.047440       0.153800  ...       135.100000  1320.000000          0.131500           0.180600         0.208000              0.113600        0.250400                 0.079480       0\n3      15.130000     29.810000       96.710000   719.500000         0.083200          0.046050        0.046860             0.027390       0.185200  ...       110.100000   931.400000          0.114800           0.098660         0.154700              0.065750        0.323300                 0.061650       0\n4       8.950000     15.760000       58.740000   245.200000         0.094620          0.124300        0.092630             0.023080       0.130500  ...        63.340000   270.000000          0.117900           0.187900         0.154400              0.038460        0.165200                 0.077220       1\n..           ...           ...             ...          ...              ...               ...             ...                  ...            ...  ...              ...          ...               ...                ...              ...                   ...             ...                      ...     ...\n567    15.182945     22.486774       98.949465   711.386079         0.092513          0.102732        0.113923             0.069481       0.179224  ...       
107.689157   826.276172          0.126730           0.199259         0.295172              0.142325        0.265352                 0.068318       0\n568    19.990378     20.622944      130.491182  1253.735467         0.091583          0.117753        0.117236             0.082771       0.202428  ...       167.456689  1995.896044          0.132457           0.289652         0.332006              0.182989        0.299088                 0.084150       0\n569    18.158121     18.928220      119.907435  1027.331092         0.113149          0.147089        0.171862             0.103942       0.209306  ...       135.286302  1319.270051          0.127029           0.233493         0.260138              0.133851        0.302406                 0.079535       0\n570    23.733233     26.433751      158.185672  1724.145541         0.098008          0.193789        0.231158             0.139527       0.188817  ...       207.483796  2844.559632          0.150495           0.463361         0.599077              0.266433        0.290828                 0.091542       0\n571    17.669575     16.375717      115.468589   968.552411         0.093636          0.109983        0.101005             0.075283       0.174505  ...       133.767576  1227.195245          0.118221           0.264624         0.249798              0.135098        0.268044                 0.076533       0\n\n[572 rows x 31 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Balancer\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...          17.33           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...          23.41           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...          25.53           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...          26.50            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...          16.67           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...          
   ...                      ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...          26.40           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...          38.25           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...          34.12           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...          39.42           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...          
30.37            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039\n\n[569 rows x 30 columns]\n\n\n&gt;&gt;&gt; balancer = Balancer(strategy=\"smote\", verbose=2)\n&gt;&gt;&gt; X, y = balancer.fit_transform(X, y)\n\nOversampling with SMOTE...\n --&gt; Adding 145 samples to class 0.\n\n\n&gt;&gt;&gt; # Note that the number of rows has increased\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter    mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter   worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0      17.990000     10.380000      122.800000  1001.000000         0.118400          0.277600        0.300100             0.147100       0.241900  ...      17.330000       184.600000  2019.000000          0.162200           0.665600         0.711900              0.265400        0.460100                 0.118900\n1      20.570000     17.770000      132.900000  1326.000000         0.084740          0.078640        0.086900             0.070170       0.181200  ...      23.410000       158.800000  1956.000000          0.123800           0.186600         0.241600              0.186000        0.275000                 0.089020\n2      19.690000     21.250000      130.000000  1203.000000         0.109600          0.159900        0.197400             0.127900       0.206900  ...      25.530000       152.500000  1709.000000          0.144400           0.424500         0.450400              0.243000        0.361300                 0.087580\n3      11.420000     20.380000       77.580000   386.100000         0.142500          0.283900        0.241400             0.105200       0.259700  ...      
26.500000        98.870000   567.700000          0.209800           0.866300         0.686900              0.257500        0.663800                 0.173000\n4      20.290000     14.340000      135.100000  1297.000000         0.100300          0.132800        0.198000             0.104300       0.180900  ...      16.670000       152.200000  1575.000000          0.137400           0.205000         0.400000              0.162500        0.236400                 0.076780\n..           ...           ...             ...          ...              ...               ...             ...                  ...            ...  ...            ...              ...          ...               ...                ...              ...                   ...             ...                      ...\n709    19.478557     23.348123      128.995257  1164.950583         0.101810          0.143231        0.194792             0.095794       0.198376  ...      30.482866       143.381227  1362.533650          0.135197           0.267786         0.365230              0.170069        0.273984                 0.076077\n710    18.752895     20.824323      124.472875  1084.317645         0.096491          0.171270        0.177021             0.095356       0.204866  ...      27.544127       160.451305  1623.116663          0.133721           0.506298         0.521417              0.203921        0.348906                 0.098688\n711    17.182368     21.204540      112.271609   925.918840         0.100517          0.110961        0.110803             0.076692       0.204604  ...      28.119577       142.316398  1439.815962          0.155602           0.277795         0.388351              0.207039        0.334574                 0.080310\n712    18.285452     20.578363      120.603613  1048.317740         0.106252          0.125135        0.153635             0.093128       0.188095  ...      
26.188544       142.298194  1487.517523          0.147703           0.251890         0.365958              0.150828        0.308848                 0.078435\n713    14.550791     25.918705       96.913441   655.023273         0.111607          0.166865        0.158127             0.077468       0.228924  ...      36.072516       123.641397   930.709825          0.163673           0.659480         0.662486              0.197880        0.423041                 0.132320\n\n[714 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/balancer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformBalance the data.</p> <p></p> <p>method fit(X, y=-1)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict or sequence, default=-1 Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=-1)[source]Balance the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str or sequence, default=-1 Target column corresponding to `X`. <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>Else: Array with shape=(n_samples,) to use as target.</li> </ul> <p>Returnsdataframe Balanced dataframe. <p>series Transformed target column. </p> <p></p>"}, {"location": "API/data_cleaning/cleaner/", "title": "Cleaner", "text": "<p>class atom.data_cleaning.Cleaner(convert_dtypes=True, drop_dtypes=None, drop_chars=None, strip_categorical=True, drop_duplicates=False, drop_missing_target=True, encode_target=True, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None)[source]Applies standard data cleaning steps on a dataset.</p> <p>Use the parameters to choose which transformations to perform. The available steps are:</p> <ul> <li>Convert dtypes to the best possible types.</li> <li>Drop columns with specific data types.</li> <li>Remove characters from column names.</li> <li>Strip categorical features from spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column.</li> </ul> <p>This class can be accessed from atom through the clean method. Read more in the user guide.</p> <p>Parametersconvert_dtypes: bool, default=True Convert the column's data types to the best possible types that support <code>pd.NA</code>. <p>drop_dtypes: str, sequence or None, default=None Columns with these data types are dropped from the dataset. <p>drop_chars: str or None, default=None Remove the specified regex pattern from column names, e.g. 
<code>[^A-Za-z0-9]+</code> to remove all non-alphanumerical characters. <p>strip_categorical: bool, default=True Whether to strip spaces from categorical columns. <p>drop_duplicates: bool, default=False Whether to drop duplicate rows. Only the first occurrence of every duplicated row is kept. <p>drop_missing_target: bool, default=True Whether to drop rows with missing values in the target column. This transformation is ignored if <code>y</code> is not provided. <p>encode_target: bool, default=True Whether to encode the target column(s). This includes converting categorical columns to numerical, and binarizing multilabel columns. This transformation is ignored if <code>y</code> is not provided. <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesmissing_: list Values that are considered \"missing\". 
Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. <p>mapping_: dict Target values mapped to their respective encoded integers. Only available if encode_target=True. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>target_names_in_: np.ndarray Names of the target column(s) seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Encoder Perform encoding of categorical features.</p> <p>Discretizer Bin continuous data into intervals.</p> <p>Scaler Scale the data.</p> <p></p>"}, {"location": "API/data_cleaning/cleaner/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; y = [\"a\" if i else \"b\" for i in y]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.y)\n\n0      a\n1      a\n2      a\n3      a\n4      a\n      ..\n564    a\n565    a\n566    a\n567    a\n568    b\nName: target, Length: 569, dtype: object\n\n\n&gt;&gt;&gt; atom.clean(verbose=2)\n\nFitting Cleaner...\nCleaning the data...\n --&gt; Label-encoding column target.\n\n\n&gt;&gt;&gt; print(atom.y)\n\n0      0\n1      0\n2      0\n3      0\n4      0\n      ..\n564    0\n565    0\n566    0\n567    0\n568    1\nName: target, Length: 569, dtype: Int64\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Cleaner\n&gt;&gt;&gt; from numpy.random import randint\n\n&gt;&gt;&gt; y = [\"a\" if i else \"b\" for i in range(randint(100))]\n\n&gt;&gt;&gt; cleaner = Cleaner(verbose=2)\n&gt;&gt;&gt; y = cleaner.fit_transform(y=y)\n\nFitting Cleaner...\nCleaning the data...\n --&gt; 
Label-encoding column target.\n\n\n&gt;&gt;&gt; print(y)\n\n0     1\n1     0\n2     0\n3     0\n4     0\n5     0\n6     0\n7     0\n8     0\n9     0\n10    0\n11    0\n12    0\n13    0\n14    0\n15    0\n16    0\n17    0\n18    0\n19    0\n20    0\n21    0\n22    0\n23    0\n24    0\n25    0\n26    0\n27    0\n28    0\n29    0\n30    0\n31    0\n32    0\n33    0\n34    0\n35    0\n36    0\nName: target, dtype: Int64\n</code></pre>"}, {"location": "API/data_cleaning/cleaner/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformInversely transform the label encoding.set_paramsSet the parameters of this estimator.transformApply the data cleaning steps to the data.</p> <p></p> <p>method fit(X=None, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Inversely transform the label encoding.</p> <p>This method only inversely transforms the target encoding. The rest of the transformations can't be inverted. If <code>encode_target=False</code>, the data is returned as is.</p> <p>ParametersX: dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Unchanged feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X=None, y=None)[source]Apply the data cleaning steps to the data.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p>"}, {"location": "API/data_cleaning/discretizer/", "title": "Discretizer", "text": "<p>class atom.data_cleaning.Discretizer(strategy=\"quantile\", bins=5, labels=None, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None)[source]Bin continuous data into intervals.</p> <p>For each feature, the bin edges are computed during fit and, together with the number of bins, they define the intervals. Ignores categorical columns.</p> <p>This class can be accessed from atom through the discretize method. Read more in the user guide.</p> <p>Tip</p> <p>The transformation returns categorical columns. Use the Encoder class to convert them back to numerical types.</p> <p>Parametersstrategy: str, default=\"quantile\" Strategy used to define the widths of the bins. 
Choose from: <ul> <li>\"uniform\": All bins have identical widths.</li> <li>\"quantile\": All bins have the same number of points.</li> <li>\"kmeans\": Values in each bin have the same nearest center of   a 1D k-means cluster.</li> <li>\"custom\": Use custom bin edges provided through <code>bins</code>.</li> </ul> <p>bins: int, sequence or dict, default=5 Bin number or bin edges in which to split every column. <ul> <li>If int: Number of bins to produce for all columns. Only for   strategy!=\"custom\".</li> <li> <p>If sequence:</p> <ul> <li>For strategy!=\"custom\": Number of bins per column. The   n-th value corresponds to the n-th column that is   transformed. Categorical columns are ignored.</li> <li>For strategy=\"custom\": Bin edges with length=n_bins - 1.   The outermost edges are always <code>-inf</code> and <code>+inf</code>, e.g.,   bins <code>[1, 2]</code> indicate <code>(-inf, 1], (1, 2], (2, inf]</code>.</li> </ul> </li> <li> <p>If dict: One of the aforementioned options per column, where   the key is the column's name. Columns that are not in the   dictionary are not transformed.</p> </li> </ul> <p>labels: sequence, dict or None, default=None Label names with which to replace the binned intervals. <ul> <li>If None: Use default labels of the form <code>(min_edge, max_edge]</code>.</li> <li>If sequence: Labels to use for all columns.</li> <li>If dict: Labels per column, where the key is the column's name.   Columns that are not in the dictionary use the default labels.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. 
Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. Only for strategy=\"quantile\". <p>Attributesfeature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Encoder Perform encoding of categorical features.</p> <p>Imputer Handle missing values in the data.</p> <p>Normalizer Transform the data to follow a Normal/Gaussian distribution.</p> <p></p>"}, {"location": "API/data_cleaning/discretizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom[\"mean radius\"])\n\n0      13.48\n1      18.31\n2      17.93\n3      15.13\n4       8.95\n       ...  \n564    14.34\n565    13.17\n566    17.30\n567    17.68\n568    14.80\nName: mean radius, Length: 569, dtype: float64\n\n\n&gt;&gt;&gt; atom.discretize(\n...     strategy=\"custom\",\n...     bins=[13, 18],\n...     labels=[\"small\", \"medium\", \"large\"],\n...     verbose=2,\n...     
columns=\"mean radius\",\n... )\n\nFitting Discretizer...\nBinning the features...\n --&gt; Discretizing feature mean radius in 3 bins.\n\n\n&gt;&gt;&gt; print(atom[\"mean radius\"])\n\n0      medium\n1       large\n2      medium\n3      medium\n4       small\n        ...  \n564    medium\n565    medium\n566    medium\n567    medium\n568    medium\nName: mean radius, Length: 569, dtype: category\nCategories (3, object): ['small' &lt; 'medium' &lt; 'large']\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Discretizer\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; print(X[\"mean radius\"])\n\n0      17.99\n1      20.57\n2      19.69\n3      11.42\n4      20.29\n       ...  \n564    21.56\n565    20.13\n566    16.60\n567    20.60\n568     7.76\nName: mean radius, Length: 569, dtype: float64\n\n\n&gt;&gt;&gt; discretizer = Discretizer(\n...     strategy=\"custom\",\n...     bins={\"mean radius\": [13, 18]},\n...     labels=[\"small\", \"medium\", \"large\"],\n...     verbose=2,\n... )\n&gt;&gt;&gt; X = discretizer.fit_transform(X)\n\nFitting Discretizer...\nBinning the features...\n --&gt; Discretizing feature mean radius in 3 bins.\n\n\n&gt;&gt;&gt; print(X[\"mean radius\"])\n\n0      medium\n1       large\n2       large\n3       small\n4       large\n        ...  
\n564     large\n565     large\n566    medium\n567     large\n568     small\nName: mean radius, Length: 569, dtype: category\nCategories (3, object): ['small' &lt; 'medium' &lt; 'large']\n</code></pre>"}, {"location": "API/data_cleaning/discretizer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformBin the data into intervals.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Bin the data into intervals.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. 
</p> <p></p>"}, {"location": "API/data_cleaning/encoder/", "title": "Encoder", "text": "<p>class atom.data_cleaning.Encoder(strategy=\"Target\", max_onehot=10, ordinal=None, infrequent_to_value=None, value=\"infrequent\", n_jobs=1, verbose=0, logger=None, **kwargs)[source]Perform encoding of categorical features.</p> <p>The encoding type depends on the number of classes in the column:</p> <ul> <li>If n_classes=2 or ordinal feature, use Ordinal-encoding.</li> <li>If 2 &lt; n_classes &lt;= <code>max_onehot</code>, use OneHot-encoding.</li> <li>If n_classes &gt; <code>max_onehot</code>, use <code>strategy</code>-encoding.</li> </ul> <p>Missing values are propagated to the output column. Unknown classes encountered during transforming are imputed according to the selected strategy. Infrequent classes can be replaced with a value in order to prevent too high cardinality.</p> <p>This class can be accessed from atom through the encode method. Read more in the user guide.</p> <p>Warning</p> <p>Three category-encoders estimators are unavailable:</p> <ul> <li>OneHotEncoder: Use the max_onehot parameter.</li> <li>HashingEncoder: Incompatibility of APIs.</li> <li>LeaveOneOutEncoder: Incompatibility of APIs.</li> </ul> <p>Parametersstrategy: str or estimator, default=\"Target\" Type of encoding to use for high cardinality features. Choose from any of the estimators in the category-encoders package or provide a custom one. <p>max_onehot: int or None, default=10 Maximum number of unique values in a feature to perform one-hot encoding. If None, <code>strategy</code>-encoding is always used for columns with more than two classes. <p>ordinal: dict or None, default=None Order of ordinal features, where the dict key is the feature's name and the value is the class order, e.g., <code>{\"salary\": [\"low\", \"medium\", \"high\"]}</code>. 
<p>infrequent_to_value: int, float or None, default=None Replaces infrequent class occurrences in categorical columns with the string in parameter <code>value</code>. This transformation is done before the encoding of the column. <ul> <li>If None: Skip this step.</li> <li>If int: Minimum number of occurrences in a class.</li> <li>If float: Minimum fraction of occurrences in a class.</li> </ul> <p>value: str, default=\"infrequent\" Value with which to replace rare classes. This parameter is ignored if <code>infrequent_to_value=None</code>. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores + 1 + value.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. <p>Attributesmapping_: dict of dicts Encoded values and their respective mapping. The column name is the key to its mapping dictionary. Only for ordinal encoding. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. 
<p></p> <p></p> <p>See Also</p> <p>Cleaner Applies standard data cleaning steps on a dataset.</p> <p>Imputer Handle missing values in the data.</p> <p>Pruner Prune outliers from the data.</p> <p></p>"}, {"location": "API/data_cleaning/encoder/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from numpy.random import randint\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; X[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\n&gt;&gt;&gt; X[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\n&gt;&gt;&gt; X[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2  cat_feature_3\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710             x0             x1            x17\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863             x0             x0            x15\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...            
0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948             x1             x0            x16\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165             x0             x0            x13\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722             x0             x1            x11\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...               ...                ...              ...                   ...             ...                      ...            ...            ...            ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072             x0             x2            x11\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618             x1             x1             x5\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...            
0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113             x0             x1            x17\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738             x0             x0             x2\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285             x0             x2            x14\n\n[569 rows x 33 columns]\n\n\n&gt;&gt;&gt; atom.encode(strategy=\"target\", max_onehot=10, verbose=2)\n\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Ordinal-encoding feature cat_feature_1. Contains 2 classes.\n --&gt; OneHot-encoding feature cat_feature_2. Contains 3 classes.\n --&gt; Target-encoding feature cat_feature_3. Contains 20 classes.\n\n\n&gt;&gt;&gt; # Note the one-hot encoded column with name [feature]_[class]\n&gt;&gt;&gt; print(atom.X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2_x1  cat_feature_2_x0  cat_feature_2_x2  cat_feature_3\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           
0.5030               0.22580          0.2807                  0.10710            0.0               1.0               0.0               0.0       0.622917\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           0.3759               0.15100          0.3074                  0.07863            0.0               0.0               1.0               0.0       0.619953\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           0.2080               0.11360          0.2504                  0.07948            1.0               0.0               1.0               0.0       0.636924\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           0.1547               0.06575          0.3233                  0.06165            0.0               0.0               1.0               0.0       0.585368\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...           0.1544               0.03846          0.1652                  0.07722            0.0               1.0               0.0               0.0       0.638596\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...                   ...             ...                      ...            ...               ...               ...               ...            ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           
0.1632               0.10870          0.3062                  0.06072            0.0               0.0               0.0               1.0       0.638596\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           0.3728               0.16070          0.3693                  0.09618            1.0               1.0               0.0               0.0       0.588596\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           0.3378               0.18570          0.3138                  0.08113            0.0               1.0               0.0               0.0       0.622917\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           0.3583               0.15150          0.2463                  0.07738            0.0               0.0               1.0               0.0       0.688596\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           
0.2060               0.08308          0.3600                  0.07285            0.0               0.0               0.0               1.0       0.662643\n\n[569 rows x 35 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Encoder\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from numpy.random import randint\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n&gt;&gt;&gt; X[\"cat_feature_1\"] = [f\"x{i}\" for i in randint(0, 2, len(X))]\n&gt;&gt;&gt; X[\"cat_feature_2\"] = [f\"x{i}\" for i in randint(0, 3, len(X))]\n&gt;&gt;&gt; X[\"cat_feature_3\"] = [f\"x{i}\" for i in randint(0, 20, len(X))]\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2  cat_feature_3\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890             x1             x2             x5\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902             x1             x2            x13\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           
0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758             x0             x0            x15\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300             x0             x2            x10\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678             x1             x1            x17\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...               ...                ...              ...                   ...             ...                      ...            ...            ...            ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115             x1             x1            x12\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637             x0             x2            x14\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           
0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820             x0             x1             x3\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400             x1             x0             x2\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039             x1             x1            x11\n\n[569 rows x 33 columns]\n\n\n&gt;&gt;&gt; encoder = Encoder(strategy=\"target\", max_onehot=10, verbose=2)\n&gt;&gt;&gt; X = encoder.fit_transform(X, y)\n\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Ordinal-encoding feature cat_feature_1. Contains 2 classes.\n --&gt; OneHot-encoding feature cat_feature_2. Contains 3 classes.\n --&gt; Target-encoding feature cat_feature_3. Contains 20 classes.\n\n\n&gt;&gt;&gt; # Note the one-hot encoded column with name [feature]_[class]\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst concavity  worst concave points  worst symmetry  worst fractal dimension  cat_feature_1  cat_feature_2_x2  cat_feature_2_x0  cat_feature_2_x1  cat_feature_3\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           
0.7119                0.2654          0.4601                  0.11890            1.0               1.0               0.0               0.0       0.645086\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           0.2416                0.1860          0.2750                  0.08902            1.0               1.0               0.0               0.0       0.604148\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           0.4504                0.2430          0.3613                  0.08758            0.0               0.0               1.0               0.0       0.675079\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...           0.6869                0.2575          0.6638                  0.17300            0.0               1.0               0.0               0.0       0.706297\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           0.4000                0.1625          0.2364                  0.07678            1.0               0.0               0.0               1.0       0.716566\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...                   ...             ...                      ...            ...               ...               ...               ...            ...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           
0.4107                0.2216          0.2060                  0.07115            1.0               0.0               0.0               1.0       0.598024\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           0.3215                0.1628          0.2572                  0.06637            0.0               1.0               0.0               0.0       0.683185\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           0.3403                0.1418          0.2218                  0.07820            0.0               0.0               0.0               1.0       0.472908\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           0.9387                0.2650          0.4087                  0.12400            1.0               0.0               1.0               0.0       0.585452\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...           0.0000                0.0000          0.2871                  0.07039            1.0               0.0               0.0               1.0       0.516759\n\n[569 rows x 35 columns]\n</code></pre>"}, {"location": "API/data_cleaning/encoder/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformEncode the data.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>Note that leaving y=None can lead to errors if the <code>strategy</code> encoder requires target values. 
For multioutput tasks, only the first target column is used to fit the encoder.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict, sequence or dataframe-like Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Encode the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Encoded dataframe. </p> <p></p>"}, {"location": "API/data_cleaning/imputer/", "title": "Imputer", "text": "<p>class atom.data_cleaning.Imputer(strat_num=\"drop\", strat_cat=\"drop\", max_nan_rows=None, max_nan_cols=None, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None)[source]Handle missing values in the data.</p> <p>Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. 
Use the <code>missing</code> attribute to customize what are considered \"missing values\".</p> <p>This class can be accessed from atom through the impute method. Read more in the user guide.</p> <p>Parametersstrat_num: str, int or float, default=\"drop\" Imputing strategy for numerical columns. Choose from: <ul> <li>\"drop\": Drop rows containing missing values.</li> <li>\"mean\": Impute with mean of column.</li> <li>\"median\": Impute with median of column.</li> <li>\"knn\": Impute using a K-Nearest Neighbors approach.</li> <li>\"iterative\": Impute using a multivariate imputer.</li> <li>\"most_frequent\": Impute with the most frequent value.</li> <li>int or float: Impute with provided numerical value.</li> </ul> <p>strat_cat: str, default=\"drop\" Imputing strategy for categorical columns. Choose from: <ul> <li>\"drop\": Drop rows containing missing values.</li> <li>\"most_frequent\": Impute with the most frequent value.</li> <li>str: Impute with provided string.</li> </ul> <p>max_nan_rows: int, float or None, default=None Maximum number or fraction of missing values in a row (if more, the row is removed). If None, ignore this step. <p>max_nan_cols: int, float or None, default=None Maximum number or fraction of missing values in a column (if more, the column is removed). If None, ignore this step. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 - value.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. 
The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. Only used when strat_num=\"iterative\". <p>Attributesmissing_: list Values that are considered \"missing\". Default values are: None, NaN, NA, NaT, +inf, -inf, \"\", \"?\", \"NA\", \"nan\", \"NaN\", \"NaT\", \"none\", \"None\", \"inf\", \"-inf\". Note that None, NaN, NA, +inf and -inf are always considered missing since they are incompatible with sklearn estimators. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. 
<p></p> <p></p> <p>See Also</p> <p>Balancer Balance the number of samples per class in the target column.</p> <p>Discretizer Bin continuous data into intervals.</p> <p>Encoder Perform encoding of categorical features.</p> <p></p>"}, {"location": "API/data_cleaning/imputer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from numpy.random import randint\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add some random missing values to the data\n&gt;&gt;&gt; for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n...     X.iat[i, j] = np.NaN\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.nans)\n\nmean radius                130\nmean texture               141\nmean perimeter             124\nmean area                  136\nmean smoothness              0\nmean compactness             0\nmean concavity               0\nmean concave points          0\nmean symmetry                0\nmean fractal dimension       0\nradius error                 0\ntexture error                0\nperimeter error              0\narea error                   0\nsmoothness error             0\ncompactness error            0\nconcavity error              0\nconcave points error         0\nsymmetry error               0\nfractal dimension error      0\nworst radius                 0\nworst texture                0\nworst perimeter              0\nworst area                   0\nworst smoothness             0\nworst compactness            0\nworst concavity              0\nworst concave points         0\nworst symmetry               0\nworst fractal dimension      0\ndtype: int64\n\n\n&gt;&gt;&gt; atom.impute(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n\nFitting Imputer...\nImputing missing values...\n --&gt; Imputing 130 missing 
values with median (13.27) in feature mean radius.\n --&gt; Imputing 141 missing values with median (18.87) in feature mean texture.\n --&gt; Imputing 124 missing values with median (85.66) in feature mean perimeter.\n --&gt; Imputing 136 missing values with median (555.1) in feature mean area.\n\n\n&gt;&gt;&gt; print(atom.n_nans)\n\n0\n</code></pre> <pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom.data_cleaning import Imputer\n&gt;&gt;&gt; from numpy.random import randint\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add some random missing values to the data\n&gt;&gt;&gt; for i, j in zip(randint(0, X.shape[0], 600), randint(0, 4, 600)):\n...     X.iloc[i, j] = np.nan\n\n&gt;&gt;&gt; imputer = Imputer(strat_num=\"median\", max_nan_rows=0.1, verbose=2)\n&gt;&gt;&gt; X, y = imputer.fit_transform(X, y)\n\nFitting Imputer...\nImputing missing values...\n --&gt; Dropping 2 samples for containing more than 3 missing values.\n --&gt; Imputing 124 missing values with median (13.38) in feature mean radius.\n --&gt; Imputing 127 missing values with median (18.87) in feature mean texture.\n --&gt; Imputing 137 missing values with median (86.54) in feature mean perimeter.\n --&gt; Imputing 134 missing values with median (561.3) in feature mean area.\n\n\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0          13.38        10.380         122.800     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...          
17.33           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890\n1          20.57        17.770          86.545      561.3          0.08474           0.07864         0.08690              0.07017         0.1812  ...          23.41           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902\n2          19.69        21.250         130.000     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...          25.53           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758\n3          11.42        20.380          77.580      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...          26.50            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300\n4          13.38        14.340         135.100     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...          16.67           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564        21.56        22.390          86.545      561.3          0.11100           0.11590         0.24390              0.13890         0.1726  ...          
26.40           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115\n565        20.13        18.865         131.200     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...          38.25           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637\n566        13.38        28.080          86.545      561.3          0.08455           0.10230         0.09251              0.05302         0.1590  ...          34.12           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820\n567        20.60        29.330         140.100     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...          39.42           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400\n568        13.38        24.540          47.920      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...          30.37            59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039\n\n[567 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/imputer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformImpute the missing values.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. 
<p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Impute the missing values.</p> <p>Note that leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Imputed dataframe. <p>series Transformed target column. Only returned if provided. 
</p> <p></p>"}, {"location": "API/data_cleaning/normalizer/", "title": "Normalizer", "text": "<p>class atom.data_cleaning.Normalizer(strategy=\"yeojohnson\", device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, random_state=None, **kwargs)[source]Transform the data to follow a Normal/Gaussian distribution.</p> <p>This transformation is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Missing values are disregarded in fit and maintained in transform. Categorical columns are ignored.</p> <p>This class can be accessed from atom through the normalize method. Read more in the user guide.</p> <p>Warning</p> <p>The quantile strategy performs a non-linear transformation. This may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable.</p> <p>Note</p> <p>The yeojohnson and boxcox strategies scale the data after transforming. Use the <code>kwargs</code> to change this behavior.</p> <p>Parametersstrategy: str, default=\"yeojohnson\" The transforming strategy. Choose from: <ul> <li>\"yeojohnson\"</li> <li>\"boxcox\" (only works with strictly positive values)</li> <li>\"quantile\": Transform features using quantiles information.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. 
Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the quantile strategy. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. <p>Attributes[strategy]_: sklearn transformer Object with which the data is transformed, e.g., <code>normalizer.yeojohnson</code> for the default strategy. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Cleaner Applies standard data cleaning steps on a dataset.</p> <p>Pruner Prune outliers from the data.</p> <p>Scaler Scale the data.</p> <p></p>"}, {"location": "API/data_cleaning/normalizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     
...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.plot_distribution(columns=0)\n</code></pre> <pre><code>&gt;&gt;&gt; atom.normalize(verbose=2)\n\nFitting Normalizer...\nNormalizing features...\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      -0.017068      0.464087        0.031104  -0.020222         0.390628          0.620790        0.562136             0.426774      -0.280554  ...         0.251532    0.081524          1.224389           1.206519         1.189835              1.522769       -0.043007                 1.378960       0\n1       1.182066      0.411242        1.183030   1.200556         0.741209          0.608244        1.100342             1.256472       0.256014  ...         1.119375    1.218096          0.759546           0.244492         0.726989              0.650523        0.424017                -0.164104       0\n2       1.105309      1.197684        1.018344   1.106437        -0.552214         -0.652544       -0.230044             0.226950      -1.050816  ...         0.973194    1.037232          0.002307          -0.374986        -0.128679              0.107299       -0.647198                -0.100126       0\n3       0.455144      2.077941        0.379512   0.486019        -0.966587         -1.447057       -0.438308            -0.480189       0.226570  ...         0.337722    0.483003         -0.785100          -1.301043        -0.483292             -0.722786        0.676588                -1.783846       0\n4      -1.898537     -0.815757       -1.745528  -1.873415        -0.102067          0.599235        0.374346            -0.662103      -2.173761  ...        -1.869111   -2.095123         -0.633206          -0.305478        -0.485431             -1.278472       -2.898859                -0.273347       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     
...\n564     0.238929     -1.546154        0.209113   0.257899         0.214334         -0.482480       -0.225132             0.183841       0.996371  ...         0.346743    0.373205         -0.079012          -0.660736        -0.423384              0.029761        0.404215                -1.894769       1\n565    -0.115233      0.675396       -0.105672  -0.125511         0.078814          0.213069        0.222118             0.375009      -0.177404  ...         0.194134    0.082260          0.804177           1.061384         0.714032              0.778530        1.315113                 0.913117       0\n566     0.972621     -0.443853        0.950416   0.971288         0.335466          0.200161        0.804757             1.074782       0.080964  ...         0.880583    0.920102          0.443592           0.144776         0.561298              1.086695        0.527842                 0.020173       0\n567     1.053489      0.446545        1.084407   1.040647         1.046541          1.237987        1.321388             1.410770       0.650180  ...         0.925288    1.016604          0.452080           0.855688         0.652219              0.657243       -0.735710                -0.260751       0\n568     0.366875     -0.289945        0.346701   0.359700        -0.309357         -0.150999       -0.574459            -0.683107       0.375972  ...         
0.207028    0.284140         -0.407994          -0.303600        -0.141124             -0.402554        1.196110                -0.638106       1\n\n[569 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.plot_distribution(columns=0)\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Normalizer\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; normalizer = Normalizer(verbose=2)\n&gt;&gt;&gt; X = normalizer.fit_transform(X)\n\nFitting Normalizer...\nNormalizing features...\n\n\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.134881     -2.678666        1.259822   1.126421         1.504114          2.165938        1.862988             1.848558       1.953067  ...      -1.488367         1.810506    1.652210          1.282792           1.942737         1.730182              1.935654        2.197206                 1.723624\n1       1.619346     -0.264377        1.528723   1.633946        -0.820227         -0.384102        0.291976             0.820609       0.102291  ...      -0.288382         1.430616    1.610022         -0.325080          -0.296580         0.070746              1.101594       -0.121997                 0.537179\n2       1.464796      0.547806        1.454664   1.461645         0.963977          1.163977        1.403673             1.683104       0.985668  ...       
0.071406         1.321941    1.425307          0.580301           1.209701         1.005512              1.722744        1.218181                 0.453955\n3      -0.759262      0.357721       -0.514886  -0.836238         2.781494          2.197843        1.642391             1.423004       2.360528  ...       0.228089        -0.039480   -0.436860          2.857821           2.282276         1.675087              1.862378        3.250202                 2.517606\n4       1.571260     -1.233520        1.583340   1.595120         0.343932          0.762392        1.407479             1.410929       0.090964  ...      -1.637882         1.316582    1.309486          0.284367          -0.131829         0.817474              0.807077       -0.943554                -0.279402\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     1.781795      0.785604        1.746492   1.823030         1.052829          0.460810        1.653784             1.783067      -0.232645  ...       0.212151         1.547961    1.657442          0.438013          -0.077871         0.859079              1.503734       -1.721528                -0.751459\n565     1.543335      1.845150        1.485601   1.545430         0.168014          0.207602        0.984746             1.320730      -0.129120  ...       1.832201         1.365939    1.443167         -0.667317          -0.245277         0.480804              0.810995       -0.480093                -1.210527\n566     0.828589      1.817618        0.811329   0.835270        -0.835509          0.183969        0.375105             0.396882      -0.808189  ...       
1.320625         0.786129    0.796192         -0.799337           0.626487         0.566826              0.526136       -1.301164                -0.170872\n567     1.624440      2.016299        1.702747   1.551036         1.468642          2.162820        1.994466             1.884414       1.899087  ...       1.968949         1.810506    1.513198          1.387135           2.284642         2.136932              1.931990        1.744693                 1.850944\n568    -2.699432      1.203224       -2.827766  -2.703256        -3.834325         -1.481409       -1.658319            -1.845392      -0.821560  ...       0.810681        -2.231436   -2.149403         -2.064647          -1.731936        -1.819966             -2.131070        0.103122                -0.820663\n\n[569 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/normalizer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.set_paramsSet the parameters of this estimator.transformApply the transformations to the data.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X, y=None)[source]Apply the inverse transformation to the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Original dataframe. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Apply the transformations to the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Normalized dataframe. 
</p> <p></p>"}, {"location": "API/data_cleaning/pruner/", "title": "Pruner", "text": "<p>class atom.data_cleaning.Pruner(strategy=\"zscore\", method=\"drop\", max_sigma=3, include_target=False, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Prune outliers from the data.</p> <p>Replace or remove outliers. The definition of outlier depends on the selected strategy and can greatly differ from one another. Ignores categorical columns.</p> <p>This class can be accessed from atom through the prune method. Read more in the user guide.</p> <p>Info</p> <p>The \"sklearnex\" and \"cuml\" engines are only supported for strategy=\"dbscan\".</p> <p>Parametersstrategy: str or sequence, default=\"zscore\" Strategy with which to select the outliers. If sequence of strategies, only samples marked as outliers by all chosen strategies are dropped. Choose from: <ul> <li>\"zscore\": Z-score of each data value.</li> <li>\"iforest\": Isolation Forest.</li> <li>\"ee\": Elliptic Envelope.</li> <li>\"lof\": Local Outlier Factor.</li> <li>\"svm\": One-class SVM.</li> <li>\"dbscan\": Density-Based Spatial Clustering.</li> <li>\"hdbscan\": Hierarchical Density-Based Spatial Clustering.</li> <li>\"optics\": DBSCAN-like clustering approach.</li> </ul> <p>method: int, float or str, default=\"drop\" Method to apply on the outliers. Only the zscore strategy accepts another method than \"drop\". Choose from: <ul> <li>\"drop\": Drop any sample with outlier values.</li> <li>\"minmax\": Replace outlier with the min/max of the column.</li> <li>Any numerical value with which to replace the outliers.</li> </ul> <p>max_sigma: int or float, default=3 Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. Only if strategy=\"zscore\". <p>include_target: bool, default=False Whether to include the target column in the search for outliers. This can be useful for regression tasks. 
Only if strategy=\"zscore\". <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. If sequence of strategies, the params should be provided in a dict with the strategy's name as key. <p>Attributes[strategy]_: sklearn estimator Object used to prune the data, e.g., <code>pruner.iforest</code> for the isolation forest strategy. Not available for strategy=\"zscore\". <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. 
<p></p> <p></p> <p>See Also</p> <p>Balancer Balance the number of samples per class in the target column.</p> <p>Normalizer Transform the data to follow a Normal/Gaussian distribution.</p> <p>Scaler Scale the data.</p> <p></p>"}, {"location": "API/data_cleaning/pruner/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           
110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           
132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.prune(strategy=\"iforest\", verbose=2)\n\nFitting Pruner...\nPruning outliers...\n --&gt; Dropping 63 outliers.\n\n\n&gt;&gt;&gt; # Note the reduced number of rows\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           
135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4          10.26         16.58           65.85      320.8          0.08877           0.08066         0.04358              0.02438         0.1669  ...            71.08       357.4            0.1461            0.22460           0.1783               0.08333          0.2691                  0.09479       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n501        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n502        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n503        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           
130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n504        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n505        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[506 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.plot_distribution(columns=0)\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Normalizer\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; normalizer = Normalizer(verbose=2)\n&gt;&gt;&gt; X = normalizer.fit_transform(X)\n\nFitting Normalizer...\nNormalizing features...\n\n\n&gt;&gt;&gt; # Note the reduced number of rows\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.134881     -2.678666        1.259822   1.126421         1.504114          2.165938        1.862988             1.848558       1.953067  ...      
-1.488367         1.810506    1.652210          1.282792           1.942737         1.730182              1.935654        2.197206                 1.723624\n1       1.619346     -0.264377        1.528723   1.633946        -0.820227         -0.384102        0.291976             0.820609       0.102291  ...      -0.288382         1.430616    1.610022         -0.325080          -0.296580         0.070746              1.101594       -0.121997                 0.537179\n2       1.464796      0.547806        1.454664   1.461645         0.963977          1.163977        1.403673             1.683104       0.985668  ...       0.071406         1.321941    1.425307          0.580301           1.209701         1.005512              1.722744        1.218181                 0.453955\n3      -0.759262      0.357721       -0.514886  -0.836238         2.781494          2.197843        1.642391             1.423004       2.360528  ...       0.228089        -0.039480   -0.436860          2.857821           2.282276         1.675087              1.862378        3.250202                 2.517606\n4       1.571260     -1.233520        1.583340   1.595120         0.343932          0.762392        1.407479             1.410929       0.090964  ...      -1.637882         1.316582    1.309486          0.284367          -0.131829         0.817474              0.807077       -0.943554                -0.279402\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...             ...                      ...\n564     1.781795      0.785604        1.746492   1.823030         1.052829          0.460810        1.653784             1.783067      -0.232645  ...       
0.212151         1.547961    1.657442          0.438013          -0.077871         0.859079              1.503734       -1.721528                -0.751459\n565     1.543335      1.845150        1.485601   1.545430         0.168014          0.207602        0.984746             1.320730      -0.129120  ...       1.832201         1.365939    1.443167         -0.667317          -0.245277         0.480804              0.810995       -0.480093                -1.210527\n566     0.828589      1.817618        0.811329   0.835270        -0.835509          0.183969        0.375105             0.396882      -0.808189  ...       1.320625         0.786129    0.796192         -0.799337           0.626487         0.566826              0.526136       -1.301164                -0.170872\n567     1.624440      2.016299        1.702747   1.551036         1.468642          2.162820        1.994466             1.884414       1.899087  ...       1.968949         1.810506    1.513198          1.387135           2.284642         2.136932              1.931990        1.744693                 1.850944\n568    -2.699432      1.203224       -2.827766  -2.703256        -3.834325         -1.481409       -1.658319            -1.845392      -0.821560  ...       0.810681        -2.231436   -2.149403         -2.064647          -1.731936        -1.819966             -2.131070        0.103122                -0.820663\n\n[569 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/pruner/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformApply the outlier strategy on the data.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). 
If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Apply the outlier strategy on the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Transformed feature set. <p>series Transformed target column. Only returned if provided. 
</p> <p></p>"}, {"location": "API/data_cleaning/scaler/", "title": "Scaler", "text": "<p>class atom.data_cleaning.Scaler(strategy=\"standard\", include_binary=False, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Scale the data.</p> <p>Apply one of sklearn's scalers. Categorical columns are ignored.</p> <p>This class can be accessed from atom through the scale method. Read more in the user guide.</p> <p>Parametersstrategy: str, default=\"standard\" Strategy with which to scale the data. Choose from: <ul> <li>\"standard\": Remove mean and scale to unit variance.</li> <li>\"minmax\": Scale features to a given range.</li> <li>\"maxabs\": Scale features by their maximum absolute value.</li> <li>\"robust\": Scale using statistics that are robust to outliers.</li> </ul> <p>include_binary: bool, default=False Whether to scale binary columns (only 0s and 1s). <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. <p>Attributes[strategy]_: sklearn transformer Object with which the data is scaled, e.g., <code>scaler.standard</code> for the default strategy. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>Balancer Balance the number of samples per class in the target column.</p> <p>Normalizer Transform the data to follow a Normal/Gaussian distribution.</p> <p>Scaler Scale the data.</p> <p></p>"}, {"location": "API/data_cleaning/scaler/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0          13.48         20.82           88.40      559.2          0.10160           0.12550         0.10630              0.05439         0.1720  ...           107.30       740.4            0.1610            0.42250           0.5030               0.22580          0.2807                  0.10710       0\n1          18.31         20.58          120.80     1052.0          0.10680           0.12480         0.15690              0.09451         0.1860  ...           
142.20      1493.0            0.1492            0.25360           0.3759               0.15100          0.3074                  0.07863       0\n2          17.93         24.48          115.20      998.9          0.08855           0.07027         0.05699              0.04744         0.1538  ...           135.10      1320.0            0.1315            0.18060           0.2080               0.11360          0.2504                  0.07948       0\n3          15.13         29.81           96.71      719.5          0.08320           0.04605         0.04686              0.02739         0.1852  ...           110.10       931.4            0.1148            0.09866           0.1547               0.06575          0.3233                  0.06165       0\n4           8.95         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...            63.34       270.0            0.1179            0.18790           0.1544               0.03846          0.1652                  0.07722       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564        14.34         13.47           92.51      641.2          0.09906           0.07624         0.05724              0.04603         0.2075  ...           110.40       873.2            0.1297            0.15250           0.1632               0.10870          0.3062                  0.06072       1\n565        13.17         21.81           85.42      531.5          0.09714           0.10470         0.08259              0.05252         0.1746  ...           
105.50       740.7            0.1503            0.39040           0.3728               0.16070          0.3693                  0.09618       0\n566        17.30         17.08          113.00      928.2          0.10080           0.10410         0.12660              0.08353         0.1813  ...           130.90      1222.0            0.1416            0.24050           0.3378               0.18570          0.3138                  0.08113       0\n567        17.68         20.74          117.40      963.7          0.11150           0.16650         0.18550              0.10540         0.1971  ...           132.90      1302.0            0.1418            0.34980           0.3583               0.15150          0.2463                  0.07738       0\n568        14.80         17.66           95.88      674.8          0.09179           0.08890         0.04069              0.02260         0.1893  ...           105.90       829.5            0.1226            0.18810           0.2060               0.08308          0.3600                  0.07285       1\n\n[569 rows x 31 columns]\n\n\n&gt;&gt;&gt; atom.scale(verbose=2)\n\nFitting Scaler...\nScaling features...\n\n\n&gt;&gt;&gt; # Note the reduced number of rows\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  target\n0      -0.181875      0.356669       -0.147122  -0.270991         0.340268          0.381628        0.214571             0.125567      -0.345050  ...         
0.000933   -0.246244          1.240292           1.077359         1.116229              1.667157       -0.162964                 1.326816       0\n1       1.162216      0.300578        1.159704   1.097856         0.707625          0.368288        0.852572             1.148598       0.172744  ...         1.025723    1.042996          0.719898          -0.011475         0.500961              0.537309        0.280594                -0.308640       0\n2       1.056470      1.212060        0.933833   0.950360        -0.581659         -0.670877       -0.407166            -0.051653      -1.018183  ...         0.817241    0.746639         -0.060694          -0.482078        -0.311813             -0.027615       -0.666328                -0.259812       0\n3       0.277287      2.457753        0.188054   0.174273        -0.959614         -1.132432       -0.534892            -0.562913       0.143156  ...         0.083151    0.080948         -0.797185          -1.010314        -0.569828             -0.750385        0.544735                -1.284055       0\n4      -1.442482     -0.825921       -1.343434  -1.143186        -0.152840          0.358760        0.042209            -0.672815      -1.879941  ...        -1.289891   -1.052061         -0.660471          -0.435018        -0.571280             -1.162598       -2.081728                -0.389638       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...     ...\n564     0.057446     -1.361124        0.018651  -0.043220         0.160827         -0.557108       -0.404013            -0.087607       0.967929  ...         
0.091960   -0.018751         -0.140077          -0.663228        -0.528681             -0.101629        0.260659                -1.337478       1\n565    -0.268141      0.588045       -0.267318  -0.347933         0.025188         -0.014753       -0.084382             0.077883      -0.248889  ...        -0.051921   -0.245730          0.768409           0.870422         0.485954              0.683827        1.308918                 0.699518       0\n566     0.881154     -0.517419        0.845098   0.753978         0.283751         -0.026187        0.470528             0.868616      -0.001087  ...         0.693914    0.578760          0.384728          -0.095926         0.316526              1.061450        0.386915                -0.165028       0\n567     0.986900      0.337972        1.022568   0.852586         1.039660          1.162956        1.213182             1.426285       0.583281  ...         0.752641    0.715804          0.393548           0.608690         0.415763              0.544861       -0.734440                -0.380446       0\n568     0.185455     -0.381865        0.154577   0.050111        -0.352767         -0.315850       -0.612688            -0.685055       0.294796  ...        -0.040176   -0.093611         -0.453195          -0.433728        -0.321494             -0.488617        1.154420                -0.640672       1\n\n[569 rows x 31 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.data_cleaning import Scaler\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; scaler = Scaler(verbose=2)\n&gt;&gt;&gt; X = scaler.fit_transform(X)\n\nFitting Scaler...\nScaling features...\n\n\n&gt;&gt;&gt; # Note the reduced number of rows\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst texture  worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension\n0       1.097064     -2.073335        1.269934   0.984375         1.568466          3.283515        2.652874             2.532475       2.217515  ...      -1.359293         2.303601    2.001237          1.307686           2.616665         2.109526              2.296076        2.750622                 1.937015\n1       1.829821     -0.353632        1.685955   1.908708        -0.826962         -0.487072       -0.023846             0.548144       0.001392  ...      -0.369203         1.535126    1.890489         -0.375612          -0.430444        -0.146749              1.087084       -0.243890                 0.281190\n2       1.579888      0.456187        1.566503   1.558884         0.942210          1.052926        1.363478             2.037231       0.939685  ...      -0.023974         1.347475    1.456285          0.527407           1.082932         0.854974              1.955000        1.152255                 0.201391\n3      -0.768909      0.253732       -0.592687  -0.764464         3.283553          3.402909        1.915897             1.451707       2.867383  ...       0.133984        -0.249939   -0.550021          3.394275           3.893397         1.989588              2.175786        6.046041                 4.935010\n4       1.750297     -1.151816        1.776573   1.826229         0.280372          0.539340        1.371011             1.428493      -0.009560  ...      -1.466770         1.338539    1.220724          0.220556          -0.313395         0.613179              0.729259       -0.868353                -0.397100\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...            ...              ...         ...               ...                ...              ...                   ...          
   ...                      ...\n564     2.110995      0.721473        2.060786   2.343856         1.041842          0.219060        1.947285             2.320965      -0.312589  ...       0.117700         1.752563    2.015301          0.378365          -0.273318         0.664512              1.629151       -1.360158                -0.709091\n565     1.704854      2.085134        1.615931   1.723842         0.102458         -0.017833        0.693043             1.263669      -0.217664  ...       2.047399         1.421940    1.494959         -0.691230          -0.394820         0.236573              0.733827       -0.531855                -0.973978\n566     0.702284      2.045574        0.672676   0.577953        -0.840484         -0.038680        0.046588             0.105777      -0.809117  ...       1.374854         0.579001    0.427906         -0.809587           0.350735         0.326767              0.414069       -1.104549                -0.318409\n567     1.838341      2.336457        1.982524   1.735218         1.525767          3.272144        3.296944             2.658866       2.137194  ...       2.237926         2.303601    1.653171          1.430427           3.904848         3.197605              2.289985        1.919083                 2.219635\n568    -1.808401      1.221792       -1.814389  -1.347789        -3.112085         -1.150752       -1.114873            -1.261820      -0.820070  ...       
0.764190        -1.432735   -1.075813         -1.859019          -1.207552        -1.305831             -1.745063       -0.048138                -0.751207\n\n[569 rows x 30 columns]\n</code></pre>"}, {"location": "API/data_cleaning/scaler/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformApply the inverse transformation to the data.set_paramsSet the parameters of this estimator.transformPerform standardization by centering and scaling.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X, y=None)[source]Apply the inverse transformation to the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Scaled dataframe. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Perform standardization by centering and scaling.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Scaled dataframe. </p> <p></p>"}, {"location": "API/feature_engineering/featureextractor/", "title": "FeatureExtractor", "text": "<p>class atom.feature_engineering.FeatureExtractor(features=('day', 'month', 'year'), fmt=None, encoding_type=\"ordinal\", drop_columns=True, verbose=0, logger=None)[source]Extract features from datetime columns.</p> <p>Create new features extracting datetime elements (day, month, year, etc...) from the provided columns. Columns of dtype <code>datetime64</code> are used as is. Categorical columns that can be successfully converted to a datetime format (less than 30% NaT values after conversion) are also used.</p> <p>This class can be accessed from atom through the feature_extraction method. 
Read more in the user guide.</p> <p>Warning</p> <p>Decision tree-based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features since the sin/cos features should be considered one single coordinate system.</p> <p>Parametersfeatures: str or sequence, default=(\"day\", \"month\", \"year\") Features to create from the datetime columns. Note that created features with zero variance (e.g., the feature hour in a column that only contains dates) are ignored. Allowed values are datetime attributes from <code>pandas.Series.dt</code>. <p>fmt: str, sequence or None, default=None Format (<code>strptime</code>) of the categorical columns that need to be converted to datetime. If sequence, the n-th format corresponds to the n-th categorical column that can be successfully converted. If None, the format is inferred automatically from the first non-NaN value. Values that cannot be converted are returned as <code>NaT</code>. <p>encoding_type: str, default=\"ordinal\" Type of encoding to use. Choose from: <ul> <li>\"ordinal\": Encode features in increasing order.</li> <li>\"cyclic\": Encode features using sine and cosine to capture   their cyclic nature. This approach creates two columns for   every feature. Non-cyclic features still use ordinal encoding.</li> </ul> <p>drop_columns: bool, default=True Whether to drop the original columns after transformation. <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesfeature_names_in_: np.ndarray Names of features seen during fit. 
<p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>FeatureGenerator Generate new features.</p> <p>FeatureGrouper Extract statistics from similar features.</p> <p>FeatureSelector Reduce the number of features in the data.</p> <p></p>"}, {"location": "API/feature_engineering/featureextractor/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; import pandas as pd\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add a datetime column\n&gt;&gt;&gt; X[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.feature_extraction(features=[\"day\"], fmt=\"%d/%m/%Y\", verbose=2)\n\nFitting FeatureExtractor...\nExtracting datetime features...\n --&gt; Extracting features from column date.\n   --&gt; Creating feature date_day.\n\n\n&gt;&gt;&gt; # Note the date_day column\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  date_day  target\n0         12.770         22.47           81.72      506.3          0.09055           0.05761         0.04711              0.02704         0.1585  ...       653.6            0.1419             0.1523           0.2177               0.09331          0.2829                  0.08067        16       0\n1         27.420         26.27          186.90     2501.0          0.10840           0.19880         0.36350              0.16890         0.2061  ...      
4254.0            0.1357             0.4256           0.6833               0.26250          0.2641                  0.07427         7       0\n2         15.850         23.95          103.70      782.7          0.08401           0.10020         0.09938              0.05364         0.1847  ...       876.5            0.1131             0.1924           0.2322               0.11190          0.2809                  0.06287        14       0\n3         14.190         23.81           92.87      610.7          0.09463           0.13060         0.11150              0.06462         0.2235  ...       811.3            0.1559             0.4059           0.3744               0.17720          0.4724                  0.10260         3       0\n4          8.950         15.76           58.74      245.2          0.09462           0.12430         0.09263              0.02308         0.1305  ...       270.0            0.1179             0.1879           0.1544               0.03846          0.1652                  0.07722        27       1\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...         ...               ...                ...              ...                   ...             ...                      ...       ...     ...\n564       10.800         21.98           68.79      359.9          0.08801           0.05743         0.03614              0.01404         0.2016  ...       489.5            0.1303             0.1696           0.1927               0.07485          0.2965                  0.07662         4       1\n565       11.930         10.91           76.14      442.7          0.08872           0.05242         0.02606              0.01796         0.1601  ...       
589.5            0.1374             0.1575           0.1514               0.06876          0.2460                  0.07262         6       1\n566       24.630         21.60          165.50     1841.0          0.10300           0.21060         0.23100              0.14710         0.1991  ...      2642.0            0.1342             0.4188           0.4658               0.24750          0.3157                  0.09671         6       0\n567        6.981         13.43           43.79      143.5          0.11700           0.07568         0.00000              0.00000         0.1930  ...       185.2            0.1584             0.1202           0.0000               0.00000          0.2932                  0.09382        12       1\n568       15.050         19.07           97.26      701.9          0.09215           0.08597         0.07486              0.04335         0.1561  ...       967.0            0.1246             0.2101           0.2866               0.11200          0.2282                  0.06954        30       0\n\n[569 rows x 32 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; import pandas as pd\n&gt;&gt;&gt; from atom.feature_engineering import FeatureExtractor\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add a datetime column\n&gt;&gt;&gt; X[\"date\"] = pd.date_range(start=\"1/1/2018\", periods=len(X))\n\n&gt;&gt;&gt; fe = FeatureExtractor(features=[\"day\"], fmt=\"%Y-%m-%d\", verbose=2)\n&gt;&gt;&gt; X = fe.transform(X)\n\nExtracting datetime features...\n --&gt; Extracting features from column date.\n   --&gt; Creating feature date_day.\n\n\n&gt;&gt;&gt; # Note the date_day column\n&gt;&gt;&gt; print(X)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  ...  
worst perimeter  worst area  worst smoothness  worst compactness  worst concavity  worst concave points  worst symmetry  worst fractal dimension  date_day\n0          17.99         10.38          122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419  ...           184.60      2019.0           0.16220            0.66560           0.7119                0.2654          0.4601                  0.11890         1\n1          20.57         17.77          132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812  ...           158.80      1956.0           0.12380            0.18660           0.2416                0.1860          0.2750                  0.08902         2\n2          19.69         21.25          130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069  ...           152.50      1709.0           0.14440            0.42450           0.4504                0.2430          0.3613                  0.08758         3\n3          11.42         20.38           77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597  ...            98.87       567.7           0.20980            0.86630           0.6869                0.2575          0.6638                  0.17300         4\n4          20.29         14.34          135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809  ...           152.20      1575.0           0.13740            0.20500           0.4000                0.1625          0.2364                  0.07678         5\n..           ...           ...             ...        ...              ...               ...             ...                  ...            ...  ...              ...         ...               ...                ...              ...                   ...             ...                      ...       
...\n564        21.56         22.39          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726  ...           166.10      2027.0           0.14100            0.21130           0.4107                0.2216          0.2060                  0.07115        19\n565        20.13         28.25          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752  ...           155.00      1731.0           0.11660            0.19220           0.3215                0.1628          0.2572                  0.06637        20\n566        16.60         28.08          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590  ...           126.70      1124.0           0.11390            0.30940           0.3403                0.1418          0.2218                  0.07820        21\n567        20.60         29.33          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397  ...           184.60      1821.0           0.16500            0.86810           0.9387                0.2650          0.4087                  0.12400        22\n568         7.76         24.54           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587  ...            
59.16       268.6           0.08996            0.06444           0.0000                0.0000          0.2871                  0.07039        23\n\n[569 rows x 31 columns]\n</code></pre>"}, {"location": "API/feature_engineering/featureextractor/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformExtract the new features.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Extract the new features.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. </p> <p></p>"}, {"location": "API/feature_engineering/featuregenerator/", "title": "FeatureGenerator", "text": "<p>class atom.feature_engineering.FeatureGenerator(strategy=\"dfs\", n_features=None, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs)[source]Generate new features.</p> <p>Create new combinations of existing features to capture the non-linear relations between the original features.</p> <p>This class can be accessed from atom through the feature_generation method. Read more in the user guide.</p> <p>Warning</p> <ul> <li>Using the <code>div</code>, <code>log</code> or <code>sqrt</code> operators can return new   features with <code>inf</code> or <code>NaN</code> values. Check the warnings that   may pop up or use atom's nans attribute.</li> <li>When using dfs with <code>n_jobs&gt;1</code>, make sure to protect your code   with <code>if __name__ == \"__main__\"</code>. Featuretools uses   dask, which uses python multiprocessing   for parallelization. The spawn method on multiprocessing   starts a new python process, which requires it to import the   __main__ module before it can do its task.</li> <li>gfg can be slow for very large populations.</li> </ul> <p>Tip</p> <p>dfs can create many new features and not all of them will be useful. Use the FeatureSelector class to reduce the number of features.</p> <p>Parametersstrategy: str, default=\"dfs\" Strategy to create new features. 
Choose from: <ul> <li>\"dfs\": Deep Feature Synthesis.</li> <li>\"gfg\": Genetic Feature Generation.</li> </ul> <p>n_features: int or None, default=None Maximum number of newly generated features to add to the dataset. If None, select all created features. <p>operators: str, sequence or None, default=None Mathematical operators to apply on the features. None to use all. Choose from: <code>add</code>, <code>sub</code>, <code>mul</code>, <code>div</code>, <code>abs</code>, <code>sqrt</code>, <code>log</code>, <code>inv</code>, <code>sin</code>, <code>cos</code>, <code>tan</code>. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p>**kwargs Additional keyword arguments for the SymbolicTransformer instance. Only for the gfg strategy. <p>Attributesgfg_: SymbolicTransformer Object used to calculate the genetic features. Only available when strategy=\"gfg\". <p>genetic_features_: pd.DataFrame Information on the newly created non-linear features. Only available when strategy=\"gfg\". 
Columns include: <ul> <li>name: Name of the feature (generated automatically).</li> <li>description: Operators used to create this feature.</li> <li>fitness: Fitness score.</li> </ul> <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>FeatureExtractor Extract features from datetime columns.</p> <p>FeatureGrouper Extract statistics from similar features.</p> <p>FeatureSelector Reduce the number of features in the data.</p> <p></p>"}, {"location": "API/feature_engineering/featuregenerator/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.feature_generation(strategy=\"dfs\", n_features=5, verbose=2)\n\nFitting FeatureGenerator...\nGenerating new features...\n --&gt; 5 new features were added.\n\n\n&gt;&gt;&gt; # Note the texture error / worst symmetry column\n&gt;&gt;&gt; print(atom.dataset)\n\n     mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  ...  mean concave points * smoothness error  mean concavity + worst radius  mean radius / smoothness error  worst concave points * worst radius  worst radius / concave points error  target\n0         13.280         13.72           85.79      541.8          0.08363           0.08575  ...                                0.000122                       14.29077                     3109.342074                             1.306235                          1681.624941       1\n1         15.460         11.89          102.50      736.9          0.12570           0.15550  ...                                
0.000592                       18.99320                     2866.679028                             3.432933                          1423.484848       0\n2         13.110         15.56           87.21      530.2          0.13980           0.17650  ...                                0.000688                       16.51710                     1830.494275                             3.239166                          1175.072046       0\n3          9.847         15.68           63.00      293.2          0.09492           0.08419  ...                                0.000211                       11.26330                     1127.691251                             0.733747                          1652.698133       1\n4         14.870         20.21           96.12      680.9          0.09587           0.08345  ...                                0.000268                       16.07824                     2746.075716                             1.628217                          1353.338969       1\n..           ...           ...             ...        ...              ...               ...  ...                                     ...                            ...                             ...                                  ...                                  ...     ...\n564       14.470         24.99           95.81      656.4          0.08837           0.12300  ...                                0.000278                       16.32090                     2027.178481                             1.954510                          1395.869191       1\n565       19.690         21.25          130.00     1203.0          0.10960           0.15990  ...                                0.000787                       23.76740                     3201.626016                             5.727510                          1145.286686       0\n566       19.270         26.47          127.90     1162.0          0.09401           0.17190  ...                                
0.000381                       24.31570                     3842.472582                             4.310775                          2504.407342       0\n567       11.760         18.14           75.00      431.1          0.09968           0.05914  ...                                0.000197                       13.38685                     2101.501072                             0.956576                           932.960894       0\n568       14.580         13.66           94.29      658.8          0.09832           0.08918  ...                                0.000215                       16.84222                     2943.670503                             1.539574                          1938.020352       1\n\n[569 rows x 36 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.feature_engineering import FeatureGenerator\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; fg = FeatureGenerator(strategy=\"dfs\", n_features=5, verbose=2)\n&gt;&gt;&gt; X = fg.fit_transform(X, y)\n\nFitting FeatureGenerator...\nGenerating new features...\n --&gt; 5 new features were added.\n\n\n&gt;&gt;&gt; # Note the radius error * worst smoothness column\n&gt;&gt;&gt; print(X)\n\n       mean radius  mean texture  mean perimeter  mean area  mean smoothness  mean compactness  ...  worst fractal dimension  mean area - perimeter error  mean texture * worst fractal dimension  symmetry error / concave points error  texture error * worst area  worst radius / compactness error\nindex                                                                                           ...                                                                                                                                                                                                   \n0            17.99         10.38          122.80     1001.0          0.11840           0.27760  ...                  
0.11890                      992.411                                1.234182                               1.892250                   1827.8007                        517.536705\n1            20.57         17.77          132.90     1326.0          0.08474           0.07864  ...                  0.08902                     1322.602                                1.581885                               1.036567                   1435.5084                       1910.550459\n2            19.69         21.25          130.00     1203.0          0.10960           0.15990  ...                  0.08758                     1198.415                                1.861075                               1.093294                   1344.8121                        588.367449\n3            11.42         20.38           77.58      386.1          0.14250           0.28390  ...                  0.17300                      382.655                                3.525740                               3.193894                    656.2612                        199.919549\n4            20.29         14.34          135.10     1297.0          0.10030           0.13280  ...                  0.07678                     1291.562                                1.101025                               0.931565                   1230.5475                        915.887850\n...            ...           ...             ...        ...              ...               ...  ...                      ...                          ...                                     ...                                    ...                         ...                               ...\n564          21.56         22.39          142.00     1479.0          0.11100           0.11590  ...                  
0.07115                     1471.327                                1.593049                               0.453953                   2545.9120                        880.318229\n565          20.13         28.25          131.20     1261.0          0.09780           0.10340  ...                  0.06637                     1255.797                                1.874953                               1.131108                   4263.4530                        977.713578\n566          16.60         28.08          108.30      858.1          0.08455           0.10230  ...                  0.07820                      854.675                                2.195856                               0.846500                   1208.3000                        508.710801\n567          20.60         29.33          140.10     1265.0          0.11780           0.27700  ...                  0.12400                     1259.228                                3.636920                               1.396635                   2904.4950                        417.992855\n568           7.76         24.54           47.92      181.0          0.05263           0.04362  ...                  0.07039                      178.452                                1.727371                                    inf                    383.5608                       2029.184549\n\n[569 rows x 35 columns]\n</code></pre>"}, {"location": "API/feature_engineering/featuregenerator/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformGenerate new features.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. 
Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Generate new features.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. </p> <p></p>"}, {"location": "API/feature_engineering/featuregrouper/", "title": "FeatureGrouper", "text": "<p>class atom.feature_engineering.FeatureGrouper(groups, operators=None, drop_columns=True, verbose=0, logger=None)[source]Extract statistics from similar features.</p> <p>Replace groups of features with related characteristics with new features that summarize statistical properties of the group. The statistical operators are calculated over every row of the group. The group names and features can be accessed through the <code>groups</code> method.</p> <p>This class can be accessed from atom through the feature_grouping method. 
Read more in the user guide.</p> <p>Parametersgroups: dict Group names and features. A feature can belong to multiple groups. <p>operators: str, sequence or None, default=None Statistical operators to apply on the groups. Any operator from <code>numpy</code> or <code>scipy.stats</code> (checked in that order) that is applied on an array can be used. If None, it uses: <code>min</code>, <code>max</code>, <code>mean</code>, <code>median</code>, <code>mode</code> and <code>std</code>. <p>drop_columns: bool, default=True Whether to drop the columns in <code>groups</code> after transformation. <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesfeature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. 
<p></p> <p></p> <p>See Also</p> <p>FeatureExtractor Extract features from datetime columns.</p> <p>FeatureGenerator Generate new features.</p> <p>FeatureSelector Reduce the number of features in the data.</p> <p></p>"}, {"location": "API/feature_engineering/featuregrouper/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.feature_grouping({\"group1\": \"mean.*\"}, verbose=2)\n\nFitting FeatureGrouper...\nGrouping features...\n --&gt; Group group1 successfully created.\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n     radius error  texture error  perimeter error  area error  smoothness error  compactness error  concavity error  concave points error  symmetry error  ...  worst symmetry  worst fractal dimension  min(group1)  max(group1)  mean(group1)  median(group1)  mode(group1)  std(group1)  target\n0          0.5190         2.9100            5.801       67.10          0.007545           0.060500         0.021340              0.018430         0.03056  ...          0.2311                  0.09203      0.07224       1132.0    130.736684        0.186400       0.07224   335.890773       0\n1          0.4564         1.0750            3.425       48.55          0.005903           0.037310         0.047300              0.015570         0.01318  ...          0.2218                  0.07820      0.05302        858.1    101.162786        0.130650       0.05302   254.320568       0\n2          0.2298         0.9988            1.534       22.18          0.002826           0.009105         0.013110              0.005174         0.01013  ...          
0.2683                  0.06829      0.02847        758.6     89.400425        0.116550       0.02847   224.981976       0\n3          0.3117         0.8155            1.972       27.94          0.005217           0.015150         0.016780              0.012680         0.01669  ...          0.2723                  0.07071      0.05723        761.7     89.389875        0.138110       0.09462   226.081026       1\n4          0.3336         1.8600            2.041       19.91          0.011880           0.037470         0.045910              0.015440         0.02287  ...          0.2383                  0.09026      0.03068        334.2     43.414796        0.161250       0.03068    99.030712       1\n..            ...            ...              ...         ...               ...                ...              ...                   ...             ...  ...             ...                      ...          ...          ...           ...             ...           ...          ...     ...\n564        0.4727         1.2400            3.195       45.40          0.005718           0.011620         0.019980              0.011090         0.01410  ...          0.3029                  0.08216      0.05259        684.5     81.456503        0.128635       0.05259   202.924880       0\n565        0.8601         1.4800            7.029      111.70          0.008124           0.036110         0.054890              0.027650         0.03176  ...          0.2909                  0.05865      0.05024       1290.0    146.813205        0.170250       0.05024   383.094862       0\n566        0.2094         0.7636            1.231       17.67          0.008725           0.020030         0.023350              0.011320         0.02625  ...          
0.3380                  0.09584      0.03370        513.7     62.632288        0.136750       0.03370   152.314252       1\n567        0.2818         0.7614            1.808       18.54          0.006142           0.006134         0.001835              0.003576         0.01637  ...          0.2738                  0.07685      0.00309        366.8     45.967364        0.109675       0.00309   108.819747       1\n568        0.2810         0.8135            3.369       23.81          0.004929           0.066570         0.076830              0.013680         0.01526  ...          0.2845                  0.12490      0.02833        542.9     66.369889        0.141200       0.02833   160.878141       1\n\n[569 rows x 27 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.feature_engineering import FeatureGrouper\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; fg = FeatureGrouper({\"group1\": [\"mean texture\", \"mean radius\"]}, verbose=2)\n&gt;&gt;&gt; X = fg.transform(X)\n\nGrouping features...\n --&gt; Group group1 successfully created.\n\n\n&gt;&gt;&gt; print(X)\n\n     mean perimeter  mean area  mean smoothness  mean compactness  mean concavity  mean concave points  mean symmetry  mean fractal dimension  radius error  ...  worst concave points  worst symmetry  worst fractal dimension  min(group1)  max(group1)  mean(group1)  median(group1)  mode(group1)  std(group1)\n0            122.80     1001.0          0.11840           0.27760         0.30010              0.14710         0.2419                 0.07871        1.0950  ...                0.2654          0.4601                  0.11890        10.38        17.99        14.185          14.185         10.38        3.805\n1            132.90     1326.0          0.08474           0.07864         0.08690              0.07017         0.1812                 0.05667        0.5435  ...                
0.1860          0.2750                  0.08902        17.77        20.57        19.170          19.170         17.77        1.400\n2            130.00     1203.0          0.10960           0.15990         0.19740              0.12790         0.2069                 0.05999        0.7456  ...                0.2430          0.3613                  0.08758        19.69        21.25        20.470          20.470         19.69        0.780\n3             77.58      386.1          0.14250           0.28390         0.24140              0.10520         0.2597                 0.09744        0.4956  ...                0.2575          0.6638                  0.17300        11.42        20.38        15.900          15.900         11.42        4.480\n4            135.10     1297.0          0.10030           0.13280         0.19800              0.10430         0.1809                 0.05883        0.7572  ...                0.1625          0.2364                  0.07678        14.34        20.29        17.315          17.315         14.34        2.975\n..              ...        ...              ...               ...             ...                  ...            ...                     ...           ...  ...                   ...             ...                      ...          ...          ...           ...             ...           ...          ...\n564          142.00     1479.0          0.11100           0.11590         0.24390              0.13890         0.1726                 0.05623        1.1760  ...                0.2216          0.2060                  0.07115        21.56        22.39        21.975          21.975         21.56        0.415\n565          131.20     1261.0          0.09780           0.10340         0.14400              0.09791         0.1752                 0.05533        0.7655  ...                
0.1628          0.2572                  0.06637        20.13        28.25        24.190          24.190         20.13        4.060\n566          108.30      858.1          0.08455           0.10230         0.09251              0.05302         0.1590                 0.05648        0.4564  ...                0.1418          0.2218                  0.07820        16.60        28.08        22.340          22.340         16.60        5.740\n567          140.10     1265.0          0.11780           0.27700         0.35140              0.15200         0.2397                 0.07016        0.7260  ...                0.2650          0.4087                  0.12400        20.60        29.33        24.965          24.965         20.60        4.365\n568           47.92      181.0          0.05263           0.04362         0.00000              0.00000         0.1587                 0.05884        0.3857  ...                0.0000          0.2871                  0.07039         7.76        24.54        16.150          16.150          7.76        8.390\n\n[569 rows x 34 columns]\n</code></pre>"}, {"location": "API/feature_engineering/featuregrouper/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformGroup features.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. 
Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Group features.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. </p> <p></p>"}, {"location": "API/feature_engineering/featureselector/", "title": "FeatureSelector", "text": "<p>class atom.feature_engineering.FeatureSelector(strategy=None, solver=None, n_features=None, min_repeated=2, max_repeated=1.0, max_correlation=1.0, n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", verbose=0, logger=None, random_state=None, **kwargs)[source]Reduce the number of features in the data.</p> <p>Apply feature selection or dimensionality reduction, either to improve the estimators' accuracy or to boost their performance on very high-dimensional datasets. 
Additionally, remove multicollinear and low-variance features.</p> <p>This class can be accessed from atom through the feature_selection method. Read more in the user guide.</p> <p>Warning</p> <ul> <li>Ties between features with equal scores are broken in an   unspecified way.</li> <li>For strategy=\"rfecv\", the <code>n_features</code> parameter is the   minimum number of features to select, not the actual   number of features that the transformer returns. It may very   well be that it returns more!</li> </ul> <p>Info</p> <ul> <li>The \"sklearnex\" and \"cuml\" engines are only supported for   strategy=\"pca\" with dense datasets.</li> <li>If strategy=\"pca\" and the data is dense and unscaled, it's   scaled to mean=0 and std=1 before fitting the PCA transformer.</li> <li>If strategy=\"pca\" and the provided data is sparse, the used   estimator is TruncatedSVD, which works more efficiently   with sparse matrices.</li> </ul> <p>Tip</p> <ul> <li>Use the plot_pca and plot_components methods to   examine the results after using strategy=\"pca\".</li> <li>Use the plot_rfecv method to examine the results after   using strategy=\"rfecv\".</li> <li>Use the plot_feature_importance method to examine how   much a specific feature contributes to the final predictions.   If the model doesn't have a <code>feature_importances_</code> attribute,   use plot_permutation_importance instead.</li> </ul> <p>Parametersstrategy: str or None, default=None Feature selection strategy to use. 
Choose from: <ul> <li>None: Do not perform any feature selection strategy.</li> <li>\"univariate\": Univariate statistical F-test.</li> <li>\"pca\": Principal Component Analysis.</li> <li>\"sfm\": Select best features according to a model.</li> <li>\"sfs\": Sequential Feature Selection.</li> <li>\"rfe\": Recursive Feature Elimination.</li> <li>\"rfecv\": RFE with cross-validated selection.</li> <li>\"pso\": Particle Swarm Optimization.</li> <li>\"hho\": Harris Hawks Optimization.</li> <li>\"gwo\": Grey Wolf Optimization.</li> <li>\"dfo\": Dragonfly Optimization.</li> <li>\"go\": Genetic Optimization.</li> </ul> <p>solver: str, func, estimator or None, default=None Solver/estimator to use for the feature selection strategy. See the corresponding documentation for an extended description of the choices. If None, the default value is used (only if strategy=\"pca\"). Choose from: <ul> <li> <p>If strategy=\"univariate\":</p> <ul> <li>\"f_classif\"</li> <li>\"f_regression\"</li> <li>\"mutual_info_classif\"</li> <li>\"mutual_info_regression\"</li> <li>\"chi2\"</li> <li>Any function with signature <code>func(X, y) -&gt; tuple[scores, p-values]</code>.</li> </ul> </li> <li> <p>If strategy=\"pca\":</p> <ul> <li> <p>If data is dense:</p> <ul> <li> <p>If engine=\"sklearn\":</p> <ul> <li>\"auto\" (default)</li> <li>\"full\"</li> <li>\"arpack\"</li> <li>\"randomized\"</li> </ul> </li> <li> <p>If engine=\"sklearnex\":</p> <ul> <li>\"full\" (default)</li> </ul> </li> <li> <p>If engine=\"cuml\":</p> <ul> <li>\"full\" (default)</li> <li>\"jacobi\"</li> </ul> </li> </ul> </li> <li> <p>If data is sparse:</p> <ul> <li>\"randomized\" (default)</li> <li>\"arpack\"</li> </ul> </li> </ul> </li> <li> <p>for the remaining strategies:   The base estimator. For sfm, rfe and rfecv, it should have   either a <code>feature_importances_</code> or <code>coef_</code> attribute after   fitting. You can use one of the predefined models. 
Add   <code>_class</code> or <code>_reg</code> after the model's  name to specify a   classification or regression task, e.g., <code>solver=\"LGB_reg\"</code>   (not necessary if called from atom). No default option.</p> </li> </ul> <p>n_features: int, float or None, default=None Number of features to select. <ul> <li>If None: Select all features.</li> <li>If &lt;1: Fraction of the total features to select.</li> <li>If &gt;=1: Number of features to select.</li> </ul> <p>If strategy=\"sfm\" and the threshold parameter is not specified, the threshold is automatically set to <code>-inf</code> to select <code>n_features</code> number of features.</p> <p>If strategy=\"rfecv\", <code>n_features</code> is the minimum number of features to select.</p> <p>This parameter is ignored if any of the following strategies is selected: pso, hho, gwo, dfo, go.</p> <p>min_repeated: int, float or None, default=2 Remove categorical features if there isn't any repeated value in at least <code>min_repeated</code> rows. The default is to keep all features with non-maximum variance, i.e., remove the features whose number of unique values is equal to the number of rows (usually the case for names, IDs, etc.). <ul> <li>If None: No check for minimum repetition.</li> <li>If &gt;1: Minimum repetition number.</li> <li>If &lt;=1: Minimum repetition fraction.</li> </ul> <p>max_repeated: int, float or None, default=1.0 Remove categorical features with the same value in at least <code>max_repeated</code> rows. The default is to keep all features with non-zero variance, i.e., remove the features that have the same value in all samples. <ul> <li>If None: No check for maximum repetition.</li> <li>If &gt;1: Maximum number of repeated occurrences.</li> <li>If &lt;=1: Maximum fraction of repeated occurrences.</li> </ul> <p>max_correlation: float or None, default=1.0 Minimum absolute Pearson correlation to identify correlated features. 
For each group, it removes all except the feature with the highest correlation to <code>y</code> (if provided, else it removes all but the first). The default value removes equal columns. If None, skip this step. <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p>**kwargs Any extra keyword argument for the strategy estimator. See the corresponding documentation for the available options. <p>Attributescollinear_: pd.DataFrame Information on the removed collinear features. Columns include: <ul> <li>drop: Name of the dropped feature.</li> <li>corr_feature: Names of the correlated features.</li> <li>corr_value: Corresponding correlation coefficients.</li> </ul> <p>[strategy]_: sklearn transformer Object used to transform the data, e.g., <code>fs.pca</code> for the pca strategy. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>FeatureExtractor Extract features from datetime columns.</p> <p>FeatureGenerator Generate new features.</p> <p>FeatureGrouper Extract statistics from similar features.</p> <p></p>"}, {"location": "API/feature_engineering/featureselector/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.feature_selection(strategy=\"pca\", n_features=12, verbose=2)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; Applying Principal Component Analysis...\n   --&gt; Scaling features...\n   --&gt; Keeping 12 components.\n   --&gt; Explained variance ratio: 0.971\n\n\n&gt;&gt;&gt; # Note that the column names changed\n&gt;&gt;&gt; print(atom.dataset)\n\n         pca0      pca1      pca2      pca3      pca4      pca5      pca6      pca7      pca8     
 pca9     pca10     pca11  target\n0    1.933532  2.215152  1.268851 -1.776239  0.069615 -0.043647  0.281363  0.122942 -0.911086 -0.223754 -0.086316 -0.929486       1\n1    1.203025  6.706587  4.445104  0.087116  3.044271 -1.130720  0.820790 -0.593311 -1.004105  0.945411 -0.199241  0.948766       1\n2    4.506063 -1.419715 -1.216228  1.189962  0.227850  0.788522 -0.829805  0.521853 -0.381054  0.676945  0.004564  0.066630       0\n3   -2.179059  0.496110 -0.870279 -0.151235 -0.715354  0.983901 -0.232186  0.449653  0.350218  0.644448  0.280308 -0.544707       1\n4    0.708048  0.859536 -2.683579  0.295765  0.712158 -1.105250 -0.226270 -0.264257  0.494656 -0.643629 -0.152528 -0.008835       0\n..        ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       ...     ...\n564 -2.477152 -1.482251 -0.389774 -0.333742  0.627651 -0.475717 -0.048757 -0.337669  0.382336  0.132000  0.204445  0.118625       1\n565 -0.400165  0.078366 -2.082886 -1.024593  0.623709 -1.003931  0.571384  0.248557 -0.489957 -0.397008 -0.132552 -0.162104       0\n566 -2.956303 -0.111232 -0.770455  0.035805  0.308638  0.311849  0.119611 -0.994997  0.495694 -0.130586  0.214798  0.358027       1\n567 -5.409548 -0.784989  1.540835  2.205277  0.249963  1.552586  1.837439 -0.796343  0.508352  0.011600 -0.066693 -0.006518       1\n568 -3.648393 -1.340745  0.503077  4.546174 -0.221396  1.229170  0.687803  0.711380  0.527799  0.139843 -0.958308  0.834252       1\n\n[569 rows x 13 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.feature_engineering import FeatureSelector\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, _ = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; fs = FeatureSelector(strategy=\"pca\", n_features=12, verbose=2)\n&gt;&gt;&gt; X = fs.fit_transform(X)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; Applying Principal Component Analysis...\n   --&gt; 
Scaling features...\n   --&gt; Keeping 12 components.\n   --&gt; Explained variance ratio: 0.97\n\n\n&gt;&gt;&gt; # Note that the column names changed\n&gt;&gt;&gt; print(X)\n\n          pca0       pca1      pca2      pca3      pca4      pca5      pca6      pca7      pca8      pca9     pca10     pca11\n0     9.192837   1.948583 -1.123166  3.633731 -1.195110  1.411424  2.159370 -0.398407 -0.157118 -0.877402  0.262955 -0.859014\n1     2.387802  -3.768172 -0.529293  1.118264  0.621775  0.028656  0.013358  0.240988 -0.711905  1.106995  0.813120  0.157923\n2     5.733896  -1.075174 -0.551748  0.912083 -0.177086  0.541452 -0.668166  0.097374  0.024066  0.454275 -0.605604  0.124387\n3     7.122953  10.275589 -3.232790  0.152547 -2.960878  3.053422  1.429911  1.059565 -1.405440 -1.116975 -1.151514  1.011316\n4     3.935302  -1.948072  1.389767  2.940639  0.546747 -1.226495 -0.936213  0.636376 -0.263805  0.377704  0.651360 -0.110515\n..         ...        ...       ...       ...       ...       ...       ...       ...       ...       ...       ...       
...\n564   6.439315  -3.576817  2.459487  1.177314 -0.074824 -2.375193 -0.596130 -0.035471  0.987929  0.256989 -0.062651  0.123342\n565   3.793382  -3.584048  2.088476 -2.506028 -0.510723 -0.246710 -0.716326 -1.113360 -0.105207 -0.108632  0.244804  0.222753\n566   1.256179  -1.902297  0.562731 -2.089227  1.809991 -0.534447 -0.192758  0.341887  0.393917  0.520877 -0.840512  0.096473\n567  10.374794   1.672010 -1.877029 -2.356031 -0.033742  0.567936  0.223082 -0.280239 -0.542035 -0.089296 -0.178628 -0.697461\n568  -5.475243  -0.670637  1.490443 -2.299157 -0.184703  1.617837  1.698952  1.046354  0.374101 -0.047726 -0.144094 -0.179496\n\n[569 rows x 12 columns]\n</code></pre>"}, {"location": "API/feature_engineering/featureselector/#methods", "title": "Methods", "text": "<p>fitFit the feature selector to the data.fit_transformFit to data, then transform it.get_metadata_routingGet metadata routing of this object.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformTransform the data.</p> <p></p> <p>method fit(X, y=None)[source]Fit the feature selector to the data.</p> <p>The univariate, sfm (when model is not fitted), sfs, rfe and rfecv strategies need a target column. Leaving it None raises an exception.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsself Estimator instance. 
</p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_metadata_routing()[source]Get metadata routing of this object.</p> <p>Returnsrouting : MetadataRequest A :class:<code>~sklearn.utils.metadata_routing.MetadataRequest</code> encapsulating routing information. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Transform the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed feature set. 
</p> <p></p>"}, {"location": "API/models/adab/", "title": "AdaBoost", "text": "<p>AdaB accept sparse</p> <p>AdaBoost is a meta-estimator that begins by fitting a classifier/regressor on the original dataset and then fits additional copies of the algorithm on the same dataset but where the weights of instances are adjusted according to the error of the current prediction.</p> <p>Corresponding estimators are:</p> <ul> <li>AdaBoostClassifier for classification tasks.</li> <li>AdaBoostRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>RandomForest Random Forest.</p> <p>XGBoost Extreme Gradient Boosting.</p> <p></p>"}, {"location": "API/models/adab/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"AdaB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: AdaB\nMetric: f1\n\n\nResults for AdaBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9583\nTime elapsed: 0.221s\n-------------------------------------------------\nTime: 0.221s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.224s\n-------------------------------------\nAdaBoost --&gt; f1: 0.9583\n</code></pre>"}, {"location": "API/models/adab/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)algorithmCategoricalDistribution(choices=('SAMME.R', 'SAMME'))</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=50, 
step=10)learning_rateFloatDistribution(high=10.0, log=True, low=0.01, step=None)lossCategoricalDistribution(choices=('linear', 'square', 'exponential'))</p> <p></p> <p></p>"}, {"location": "API/models/adab/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/adab/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/adab/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/adab/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/ard/", "title": "AutomaticRelevanceDetermination", "text": "<p>ARD needs scaling</p> <p>Automatic Relevance Determination is very similar to BayesianRidge, but can lead to sparser coefficients. Fit the weights of a regression model, using an ARD prior. 
The weights of the regression model are assumed to be in Gaussian distributions.</p> <p>Corresponding estimators are:</p> <ul> <li>ARDRegression for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BayesianRidge Bayesian ridge regression.</p> <p>GaussianProcess Gaussian process.</p> <p>LeastAngleRegression Least Angle Regression.</p> <p></p>"}, {"location": "API/models/ard/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ARD\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ARD\nMetric: r2\n\n\nResults for AutomaticRelevanceDetermination:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6029\nTime elapsed: 0.139s\n-------------------------------------------------\nTime: 0.139s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.140s\n-------------------------------------\nAutomaticRelevanceDetermination --&gt; r2: 0.6029\n</code></pre>"}, {"location": "API/models/ard/#hyperparameters", "title": "Hyperparameters", "text": "<p>Parametersn_iterIntDistribution(high=1000, log=False, low=100, step=10)alpha_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)alpha_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/ard/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ard/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/ard/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/ard/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as a REST API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/arima/", "title": "ARIMA", "text": "<p>ARIMA native multioutput</p> <p>Seasonal ARIMA models and exogenous input are supported, hence this estimator is capable of fitting SARIMA, ARIMAX, and SARIMAX.</p> <p>An ARIMA model is a generalization of an autoregressive moving average (ARMA) model, and is fitted to time-series data in an effort to forecast future points. ARIMA models can be especially efficacious in cases where data shows evidence of non-stationarity.</p> <p>The \"AR\" part of ARIMA indicates that the evolving variable of interest is regressed on its own lagged (i.e., prior observed) values. 
The \"MA\" part indicates that the regression error is actually a linear combination of error terms whose values occurred contemporaneously and at various times in the past. The \"I\" (for \"integrated\") indicates that the data values have been replaced with the difference between their values and the previous values (and this differencing process may have been performed more than once).</p> <p>Corresponding estimators are:</p> <ul> <li>ARIMA for forecasting tasks.</li> </ul> <p>Warning</p> <p>ARIMA often runs into numerical errors when optimizing the hyperparameters. Possible solutions are:</p> <ul> <li>Use the AutoARIMA model instead.</li> <li>Use <code>est_params</code> to specify the   orders manually, e.g., <code>atom.run(\"arima\", n_trials=5,est_params={\"order\": (1, 1, 0)})</code>.</li> <li>Use the <code>catch</code> parameter in <code>ht_params</code>   to avoid raising every exception, e.g., <code>atom.run(\"arima\",n_trials=5, ht_params={\"catch\": (Exception,)})</code>.</li> </ul> <p></p> <p>See Also</p> <p>AutoARIMA Automatic Autoregressive Integrated Moving Average Model.</p> <p></p>"}, {"location": "API/models/arima/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_longley\n\n&gt;&gt;&gt; _, X = load_longley()\n\n&gt;&gt;&gt; atom = ATOMForecaster(X)\n&gt;&gt;&gt; atom.run(models=\"ARIMA\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ARIMA\nMetric: mape\n\n\nResults for ARIMA:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0131\nTest evaluation --&gt; mape: -0.0364\nTime elapsed: 0.214s\n-------------------------------------------------\nTime: 0.214s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.215s\n-------------------------------------\nARIMA --&gt; mape: -0.0364\n</code></pre>"}, {"location": "API/models/arima/#hyperparameters", "title": "Hyperparameters", "text": 
"<p>ParameterspIntDistribution(high=2, log=False, low=0, step=1)dIntDistribution(high=1, log=False, low=0, step=1)qIntDistribution(high=2, log=False, low=0, step=1)PIntDistribution(high=2, log=False, low=0, step=1)DIntDistribution(high=1, log=False, low=0, step=1)QIntDistribution(high=2, log=False, low=0, step=1)SCategoricalDistribution(choices=(0, 4, 6, 7, 12))methodCategoricalDistribution(choices=('newton', 'nm', 'bfgs', 'lbfgs', 'powell', 'cg', 'ncg', 'basinhopping'))maxiterIntDistribution(high=200, log=False, low=50, step=10)with_interceptCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/arima/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/arima/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. 
n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/arima/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/arima/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. 
The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. 
If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. 
</p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. 
The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/autoarima/", "title": "AutoARIMA", "text": "<p>AutoARIMA native multioutput</p> <p>ARIMA implementation that includes automated fitting of (S)ARIMA(X) hyperparameters (p, d, q, P, D, Q). The AutoARIMA algorithm seeks to identify the most optimal parameters for an ARIMA model, settling on a single fitted ARIMA model. This process is based on the commonly-used R function.</p> <p>AutoARIMA works by conducting differencing tests (i.e., Kwiatkowski\u2013Phillips\u2013Schmidt\u2013Shin, Augmented Dickey-Fuller or Phillips\u2013Perron) to determine the order of differencing, d, and then fitting models within defined ranges. AutoARIMA also seeks to identify the optimal P and Q hyperparameters after conducting the Canova-Hansen to determine the optimal order of seasonal differencing.</p> <p>Note that due to stationarity issues, AutoARIMA might not find a suitable model that will converge. 
If this is the case, a ValueError is thrown suggesting stationarity-inducing measures be taken prior to re-fitting or that a new range of order values be selected.</p> <p>Corresponding estimators are:</p> <ul> <li>AutoARIMA for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ARIMA Autoregressive Integrated Moving Average Model.</p> <p>ETS ETS model with automatic fitting capabilities.</p> <p></p>"}, {"location": "API/models/autoarima/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_longley\n\n&gt;&gt;&gt; _, X = load_longley()\n\n&gt;&gt;&gt; atom = ATOMForecaster(X, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"autoarima\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: AutoARIMA\nMetric: mape\n\n\nResults for AutoARIMA:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0131\nTest evaluation --&gt; mape: -0.0359\nTime elapsed: 0.437s\n-------------------------------------------------\nTime: 0.437s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.438s\n-------------------------------------\nAutoARIMA --&gt; mape: -0.0359\n</code></pre>"}, {"location": "API/models/autoarima/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParametersmethodCategoricalDistribution(choices=('newton', 'nm', 'bfgs', 'lbfgs', 'powell', 'cg', 'ncg', 'basinhopping'))maxiterIntDistribution(high=200, log=False, low=50, step=10)with_interceptCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/autoarima/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/autoarima/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/autoarima/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/autoarima/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or 
existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. 
</p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. 
<p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. 
Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
</p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. 
<p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. 
</p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/bag/", "title": "Bagging", "text": "<p>Bag accept sparse</p> <p>Bagging uses an ensemble meta-estimator that fits base predictors on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator by introducing randomization into its construction procedure and then making an ensemble out of it.</p> <p>Corresponding estimators are:</p> <ul> <li>BaggingClassifier for classification tasks.</li> <li>BaggingRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>LogisticRegression Logistic Regression.</p> <p>RandomForest Random Forest.</p> <p></p>"}, {"location": "API/models/bag/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Bag\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Bag\nMetric: f1\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9982\nTest evaluation --&gt; f1: 0.9444\nTime elapsed: 0.101s\n-------------------------------------------------\nTime: 0.101s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.104s\n-------------------------------------\nBagging --&gt; f1: 
0.9444\n</code></pre>"}, {"location": "API/models/bag/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)max_samplesFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)max_featuresFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)bootstrapCategoricalDistribution(choices=(True, False))bootstrap_featuresCategoricalDistribution(choices=(True, False))</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)max_samplesFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)max_featuresFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)bootstrapCategoricalDistribution(choices=(True, False))bootstrap_featuresCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/bag/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/bag/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. 
y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/bag/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. 
<p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/bag/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement an <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/bnb/", "title": "BernoulliNB", "text": "<p>BNB accept sparse supports acceleration</p> <p>BernoulliNB implements the Naive Bayes algorithm for multivariate Bernoulli models. Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MNB works with occurrence counts, BNB is designed for binary/boolean features.</p> <p>Corresponding estimators are:</p> <ul> <li>BernoulliNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>ComplementNB Complement Naive Bayes.</p> <p>CategoricalNB Categorical Naive Bayes.</p> <p>MultinomialNB Multinomial Naive Bayes.</p> <p></p>"}, {"location": "API/models/bnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"BNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: BNB\nMetric: f1\n\n\nResults for BernoulliNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.7709\nTest evaluation --&gt; f1: 0.7717\nTime elapsed: 0.023s\n-------------------------------------------------\nTime: 0.023s\n\n\nFinal results ==================== 
&gt;&gt;\nTotal time: 0.026s\n-------------------------------------\nBernoulliNB --&gt; f1: 0.7717\n</code></pre>"}, {"location": "API/models/bnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/bnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/bnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. 
target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/bnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. 
Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/bnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/br/", "title": "BayesianRidge", "text": "<p>BR needs scaling</p> <p>Bayesian regression techniques can be used to include regularization parameters in the estimation procedure: the regularization parameter is not set in a hard sense but tuned to the data at hand.</p> <p>Corresponding estimators are:</p> <ul> <li>BayesianRidge for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>AutomaticRelevanceDetermination Automatic Relevance Determination.</p> <p>GaussianProcess Gaussian process.</p> <p>LeastAngleRegression Least Angle Regression.</p> <p></p>"}, {"location": "API/models/br/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"BR\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: BR\nMetric: r2\n\n\nResults for BayesianRidge:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6028\nTime elapsed: 0.138s\n-------------------------------------------------\nTime: 0.138s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 
0.139s\n-------------------------------------\nBayesianRidge --&gt; r2: 0.6028\n</code></pre>"}, {"location": "API/models/br/#hyperparameters", "title": "Hyperparameters", "text": "<p>Parametersn_iterIntDistribution(high=1000, log=False, low=100, step=10)alpha_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)alpha_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_1FloatDistribution(high=1.0, log=True, low=0.0001, step=None)lambda_2FloatDistribution(high=1.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/br/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/br/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/br/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/br/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/catb/", "title": "CatBoost", "text": "<p>CatB needs scaling accept sparse allows validation supports acceleration</p> <p>CatBoost is a machine learning method based on gradient boosting over decision trees. Main advantages of CatBoost:</p> <ul> <li>Superior quality when compared with other GBDT models on many   datasets.</li> <li>Best in class prediction speed.</li> </ul> <p>Corresponding estimators are:</p> <ul> <li>CatBoostClassifier for classification tasks.</li> <li>CatBoostRegressor for regression tasks.</li> </ul> <p>Read more in CatBoost's documentation.</p> <p>Warning</p> <ul> <li>CatBoost selects the weights achieved by the best evaluation   on the test set after training. This means that, by default,   there is some minor data leakage in the test set. Use the   <code>use_best_model=False</code> parameter to avoid this behavior or use   a holdout set to evaluate the final estimator.</li> <li>In-training validation and pruning are disabled when   <code>device=\"gpu\"</code>.</li> </ul> <p>Note</p> <p>ATOM uses CatBoost's <code>n_estimators</code> parameter instead of <code>iterations</code> to indicate the number of trees to fit. 
This is done to have consistent naming with the XGBoost and LightGBM models.</p> <p></p> <p>See Also</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>LightGBM Light Gradient Boosting Machine.</p> <p>XGBoost Extreme Gradient Boosting.</p> <p></p>"}, {"location": "API/models/catb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"CatB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: CatB\nMetric: f1\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9655\nTime elapsed: 14.218s\n-------------------------------------------------\nTime: 14.218s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 14.221s\n-------------------------------------\nCatBoost --&gt; f1: 0.9655\n</code></pre>"}, {"location": "API/models/catb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)bootstrap_typeCategoricalDistribution(choices=('Bayesian', 'Bernoulli'))bagging_temperatureFloatDistribution(high=10.0, log=False, low=0.0, step=None)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.001, step=None)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, 
step=None)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)bootstrap_typeCategoricalDistribution(choices=('Bayesian', 'Bernoulli'))bagging_temperatureFloatDistribution(high=10.0, log=False, low=0.0, step=None)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/catb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/catb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. 
target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/catb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. 
This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/catb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/catnb/", "title": "CategoricalNB", "text": "<p>CatNB accept sparse supports acceleration</p> <p>Categorical Naive Bayes implements the Naive Bayes algorithm for categorical features.</p> <p>Corresponding estimators are:</p> <ul> <li>CategoricalNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BernoulliNB Bernoulli Naive Bayes.</p> <p>ComplementNB Complement Naive Bayes.</p> <p>GaussianNB Gaussian Naive Bayes.</p> <p></p>"}, {"location": "API/models/catnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; import numpy as np\n\n&gt;&gt;&gt; X = np.random.randint(5, size=(100, 100))\n&gt;&gt;&gt; y = np.random.randint(2, size=100)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"CatNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: CatNB\nMetric: f1\n\n\nResults for CategoricalNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.4444\nTime elapsed: 0.029s\n-------------------------------------------------\nTime: 0.029s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.032s\n-------------------------------------\nCategoricalNB --&gt; f1: 0.4444 ~\n</code></pre>"}, {"location": 
"API/models/catnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/catnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/catnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). 
</p> <p></p>"}, {"location": "API/models/catnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. 
Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/catnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/cnb/", "title": "ComplementNB", "text": "<p>CNB accept sparse supports acceleration</p> <p>The Complement Naive Bayes classifier was designed to correct the \"severe assumptions\" made by the standard MultinomialNB classifier. It is particularly suited for imbalanced datasets.</p> <p>Corresponding estimators are:</p> <ul> <li>ComplementNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BernoulliNB Bernoulli Naive Bayes.</p> <p>CategoricalNB Categorical Naive Bayes.</p> <p>MultinomialNB Multinomial Naive Bayes.</p> <p></p>"}, {"location": "API/models/cnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"CNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: CNB\nMetric: f1\n\n\nResults for ComplementNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9221\nTest evaluation --&gt; f1: 0.9128\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 
0.023s\n-------------------------------------\nComplementNB --&gt; f1: 0.9128\n</code></pre>"}, {"location": "API/models/cnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))normCategoricalDistribution(choices=(True, False))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))normCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/cnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/cnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. 
features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/cnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/cnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/dummy/", "title": "Dummy", "text": "<p>Dummy</p> <p>When doing supervised learning, a simple sanity check consists of comparing one's estimator against simple rules of thumb. The prediction methods completely ignore the input data. Do not use this model for real problems. Use it only as a simple baseline to compare with other models.</p> <p>Corresponding estimators are:</p> <ul> <li>DummyClassifier for classification tasks.</li> <li>DummyRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>ExtraTree Extremely Randomized Tree.</p> <p>NaiveForecaster Naive Forecaster.</p> <p></p>"}, {"location": "API/models/dummy/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Dummy\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Dummy\nMetric: f1\n\n\nResults for Dummy:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.7709\nTest evaluation --&gt; f1: 0.7717\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal 
results ==================== &gt;&gt;\nTotal time: 0.021s\n-------------------------------------\nDummy --&gt; f1: 0.7717\n</code></pre>"}, {"location": "API/models/dummy/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParametersstrategyCategoricalDistribution(choices=('most_frequent', 'prior', 'stratified', 'uniform'))</p> <p>ParametersstrategyCategoricalDistribution(choices=('mean', 'median', 'quantile'))quantileFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/dummy/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/dummy/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/dummy/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/dummy/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/en/", "title": "ElasticNet", "text": "<p>EN needs scaling accept sparse supports acceleration</p> <p>Linear least squares with l1 and l2 regularization.</p> <p>Corresponding estimators are:</p> <ul> <li>ElasticNet for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>Lasso Linear Regression with lasso regularization.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p>Ridge Linear least squares with l2 regularization.</p> <p></p>"}, {"location": "API/models/en/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"EN\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: EN\nMetric: r2\n\n\nResults for ElasticNet:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.2061\nTest evaluation --&gt; r2: 0.2016\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.139s\n-------------------------------------\nElasticNet --&gt; r2: 0.2016\n</code></pre>"}, {"location": "API/models/en/#hyperparameters", "title": 
"Hyperparameters", "text": "sklearnsklearnexcuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> cpugpu <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p></p> <p></p>"}, {"location": "API/models/en/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/en/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. 
X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/en/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. 
best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/en/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement an <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/es/", "title": "ExponentialSmoothing", "text": "<p>ES native multioutput</p> <p>Holt-Winters exponential smoothing forecaster. The default settings use simple exponential smoothing, without trend and seasonality components.</p> <p>Corresponding estimators are:</p> <ul> <li>ExponentialSmoothing for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ARIMA Autoregressive Integrated Moving Average Model.</p> <p>ETS ETS model with automatic fitting capabilities.</p> <p>PolynomialTrend Polynomial Trend forecaster.</p> <p></p>"}, {"location": "API/models/es/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ES\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ES\nMetric: mape\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0864\nTest evaluation --&gt; mape: -0.2303\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.020s\n-------------------------------------\nExponentialSmoothing --&gt; mape: -0.2303\n</code></pre>"}, {"location": "API/models/es/#hyperparameters", 
"title": "Hyperparameters", "text": "<p>ParameterstrendCategoricalDistribution(choices=('add', 'mul', None))damped_trendCategoricalDistribution(choices=(True, False))seasonalCategoricalDistribution(choices=('add', 'mul', None))spCategoricalDistribution(choices=(4, 6, 7, 12, None))use_boxcoxCategoricalDistribution(choices=(True, False))initialization_methodCategoricalDistribution(choices=('estimated', 'heuristic'))methodCategoricalDistribution(choices=('L-BFGS-B', 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'bh', 'ls'))</p> <p></p> <p></p>"}, {"location": "API/models/es/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/es/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/es/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/es/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. 
The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. 
If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. 
</p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. 
The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/et/", "title": "ExtraTrees", "text": "<p>ET accept sparse native multilabel native multioutput</p> <p>Extra-Trees use a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.</p> <p>Corresponding estimators are:</p> <ul> <li>ExtraTreesClassifier for classification tasks.</li> <li>ExtraTreesRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>ExtraTree Extremely Randomized Tree.</p> <p>RandomForest Random Forest.</p> <p></p>"}, {"location": "API/models/et/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ET\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ET\nMetric: f1\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9655\nTime elapsed: 0.110s\n-------------------------------------------------\nTime: 0.110s\n\n\nFinal results 
==================== &gt;&gt;\nTotal time: 0.112s\n-------------------------------------\nExtraTrees --&gt; f1: 0.9655\n</code></pre>"}, {"location": "API/models/et/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p></p> <p></p>"}, {"location": "API/models/et/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/et/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/et/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/et/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/etree/", "title": "ExtraTree", "text": "<p>ETree accept sparse native multilabel native multioutput</p> <p>Extra-trees differ from classic decision trees in the way they are built. When looking for the best split to separate the samples of a node into two groups, random splits are drawn for each of the max_features randomly selected features and the best split among those is chosen. 
When max_features is set 1, this amounts to building a totally random decision tree.</p> <p>Corresponding estimators are:</p> <ul> <li>ExtraTreeClassifier for classification tasks.</li> <li>ExtraTreeRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>ExtraTrees Extremely Randomized Trees.</p> <p>RandomForest Random Forest.</p> <p></p>"}, {"location": "API/models/etree/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ETree\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ETree\nMetric: f1\n\n\nResults for ExtraTree:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9241\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.024s\n-------------------------------------\nExtraTree --&gt; f1: 0.9241\n</code></pre>"}, {"location": "API/models/etree/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParameterscriterionCategoricalDistribution(choices=('gini', 'entropy'))splitterCategoricalDistribution(choices=('random', 'best'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> 
<p>ParameterscriterionCategoricalDistribution(choices=('squared_error', 'absolute_error'))splitterCategoricalDistribution(choices=('random', 'best'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p></p> <p></p>"}, {"location": "API/models/etree/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/etree/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. 
features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/etree/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/etree/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/ets/", "title": "ETS", "text": "<p>ETS native multioutput</p> <p>The ETS models are a family of time series models with an underlying state space model consisting of a level component, a trend component (T), a seasonal component (S), and an error term (E).</p> <p>Corresponding estimators are:</p> <ul> <li>AutoETS for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ARIMA Autoregressive Integrated Moving Average Model.</p> <p>ExponentialSmoothing Exponential Smoothing forecaster.</p> <p>PolynomialTrend Polynomial Trend forecaster.</p> <p></p>"}, {"location": "API/models/ets/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"ETS\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: ETS\nMetric: mape\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0858\nTest evaluation --&gt; mape: -0.2305\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.022s\n-------------------------------------\nETS --&gt; mape: -0.2305\n</code></pre>"}, {"location": "API/models/ets/#hyperparameters", "title": 
"Hyperparameters", "text": "<p>ParameterserrorCategoricalDistribution(choices=('add', 'mul'))trendCategoricalDistribution(choices=('add', 'mul', None))damped_trendCategoricalDistribution(choices=(True, False))seasonalCategoricalDistribution(choices=('add', 'mul', None))spCategoricalDistribution(choices=(1, 4, 6, 7, 12))initialization_methodCategoricalDistribution(choices=('estimated', 'heuristic'))maxiterIntDistribution(high=2000, log=False, low=500, step=100)autoCategoricalDistribution(choices=(True, False))information_criterionCategoricalDistribution(choices=('aic', 'bic', 'aicc'))</p> <p></p> <p></p>"}, {"location": "API/models/ets/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ets/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. 
n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/ets/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/ets/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. 
The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. 
If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. 
<p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. 
</p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. 
The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/gbm/", "title": "GradientBoostingMachine", "text": "<p>GBM accept sparse</p> <p>A Gradient Boosting Machine builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage <code>n_classes_</code> regression trees are fit on the negative gradient of the loss function, e.g. binary or multiclass log loss. Binary classification is a special case where only a single regression tree is induced.</p> <p>Corresponding estimators are:</p> <ul> <li>GradientBoostingClassifier for classification tasks.</li> <li>GradientBoostingRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p>Tip</p> <p>HistGradientBoosting is a much faster variant of this algorithm for intermediate datasets (n_samples &gt;= 10k).</p> <p></p> <p>See Also</p> <p>CatBoost Cat Boosting Machine.</p> <p>HistGradientBoosting Histogram-based Gradient Boosting Machine.</p> <p>LightGBM Light Gradient Boosting Machine.</p> <p></p>"}, {"location": "API/models/gbm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"GBM\", metric=\"f1\", 
verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: GBM\nMetric: f1\n\n\nResults for GradientBoostingMachine:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9589\nTime elapsed: 0.886s\n-------------------------------------------------\nTime: 0.886s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.890s\n-------------------------------------\nGradientBoostingMachine --&gt; f1: 0.9589\n</code></pre>"}, {"location": "API/models/gbm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParameterslossCategoricalDistribution(choices=('log_loss', 'exponential'))learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)n_estimatorsIntDistribution(high=500, log=False, low=10, step=10)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)criterionCategoricalDistribution(choices=('friedman_mse', 'squared_error'))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_depthIntDistribution(high=21, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>ParameterslossCategoricalDistribution(choices=('squared_error', 'absolute_error', 'huber', 'quantile'))learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)n_estimatorsIntDistribution(high=500, log=False, low=10, step=10)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)criterionCategoricalDistribution(choices=('friedman_mse', 'squared_error'))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_depthIntDistribution(high=21, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 
0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)alphaFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/gbm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gbm/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/gbm/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/gbm/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/gnb/", "title": "GaussianNB", "text": "<p>GNB supports acceleration</p> <p>Gaussian Naive Bayes implements the Naive Bayes algorithm for classification. 
The likelihood of the features is assumed to be Gaussian.</p> <p>Corresponding estimators are:</p> <ul> <li>GaussianNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BernoulliNB Bernoulli Naive Bayes.</p> <p>CategoricalNB Categorical Naive Bayes.</p> <p>ComplementNB Complement Naive Bayes.</p> <p></p>"}, {"location": "API/models/gnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"GNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: GNB\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9553\nTest evaluation --&gt; f1: 0.9371\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.022s\n-------------------------------------\nGaussianNB --&gt; f1: 0.9371\n</code></pre>"}, {"location": "API/models/gnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. 
y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/gnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). 
best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/gnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/gp/", "title": "GaussianProcess", "text": "<p>GP</p> <p>Gaussian Processes are a generic supervised learning method designed to solve regression and probabilistic classification problems. The advantages of Gaussian processes are:</p> <ul> <li>The prediction interpolates the observations.</li> <li>The prediction is probabilistic (Gaussian) so that one can compute   empirical confidence intervals and decide based on those if one   should refit (online fitting, adaptive fitting) the prediction in   some region of interest.</li> </ul> <p>The disadvantages of Gaussian processes include:</p> <ul> <li>They are not sparse, i.e., they use the whole samples/features   information to perform the prediction.</li> <li>They lose efficiency in high dimensional spaces, namely when the   number of features exceeds a few dozens.</li> </ul> <p>Corresponding estimators are:</p> <ul> <li>GaussianProcessClassifier for classification tasks.</li> <li>GaussianProcessRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>GaussianNB Gaussian Naive Bayes.</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>PassiveAggressive Passive Aggressive.</p> <p></p>"}, {"location": "API/models/gp/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import 
load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"GP\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: GP\nMetric: f1\n\n\nResults for GaussianProcess:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9437\nTime elapsed: 0.105s\n-------------------------------------------------\nTime: 0.105s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.109s\n-------------------------------------\nGaussianProcess --&gt; f1: 0.9437\n</code></pre>"}, {"location": "API/models/gp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/gp/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). 
columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/gp/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. 
bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/gp/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/hgbm/", "title": "HistGradientBoosting", "text": "<p>hGBM</p> <p>This Histogram-based Gradient Boosting Machine is much faster than the standard GradientBoostingMachine for big datasets (n_samples&gt;=10k). This variation first bins the input samples into integer-valued bins which tremendously reduces the number of splitting points to consider, and allows the algorithm to leverage integer-based data structures (histograms) instead of relying on sorted continuous values when building the trees.</p> <p>Corresponding estimators are:</p> <ul> <li>HistGradientBoostingClassifier for classification tasks.</li> <li>HistGradientBoostingRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>CatBoost Cat Boosting Machine.</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>XGBoost Extreme Gradient Boosting.</p> <p></p>"}, {"location": "API/models/hgbm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"hGBM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: hGBM\nMetric: f1\n\n\nResults for HistGradientBoosting:\nFit 
---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9583\nTime elapsed: 0.357s\n-------------------------------------------------\nTime: 0.357s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.360s\n-------------------------------------\nHistGradientBoosting --&gt; f1: 0.9583\n</code></pre>"}, {"location": "API/models/hgbm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parameterslearning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_iterIntDistribution(high=500, log=False, low=10, step=10)max_leaf_nodesIntDistribution(high=50, log=False, low=10, step=1)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_leafIntDistribution(high=30, log=False, low=10, step=1)l2_regularizationFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p>ParameterslossCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson', 'quantile', 'gamma'))quantileFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_iterIntDistribution(high=500, log=False, low=10, step=10)max_leaf_nodesIntDistribution(high=50, log=False, low=10, step=1)max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_leafIntDistribution(high=30, log=False, low=10, step=1)l2_regularizationFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/hgbm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/hgbm/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/hgbm/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/hgbm/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/huber/", "title": "HuberRegression", "text": "<p>Huber needs scaling</p> <p>Huber is a linear regression model that is robust to outliers. 
It makes sure that the loss function is not heavily influenced by the outliers while not completely ignoring their effect.</p> <p>Corresponding estimators are:</p> <ul> <li>HuberRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>AutomaticRelevanceDetermination Automatic Relevance Determination.</p> <p>LeastAngleRegression Least Angle Regression.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p></p>"}, {"location": "API/models/huber/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Huber\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Huber\nMetric: r2\n\n\nResults for HuberRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.546\nTest evaluation --&gt; r2: 0.5999\nTime elapsed: 0.187s\n-------------------------------------------------\nTime: 0.187s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.187s\n-------------------------------------\nHuberRegression --&gt; r2: 0.5999\n</code></pre>"}, {"location": "API/models/huber/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParametersepsilonFloatDistribution(high=10.0, log=True, low=1.0, step=None)max_iterIntDistribution(high=500, log=False, low=50, step=10)alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/huber/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/huber/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/huber/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/huber/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/knn/", "title": "KNearestNeighbors", "text": "<p>KNN needs scaling accept sparse native multilabel native multioutput supports acceleration</p> <p>K-Nearest Neighbors, as the name clearly indicates, implements the k-nearest neighbors vote. 
For regression, the target is predicted by local interpolation of the targets associated with the nearest neighbors in the training set.</p> <p>Corresponding estimators are:</p> <ul> <li>KNeighborsClassifier for classification tasks.</li> <li>KNeighborsRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.</p> <p>RadiusNearestNeighbors Radius Nearest Neighbors.</p> <p></p>"}, {"location": "API/models/knn/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"KNN\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: KNN\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.981\nTest evaluation --&gt; f1: 0.9793\nTime elapsed: 0.116s\n-------------------------------------------------\nTime: 0.116s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.119s\n-------------------------------------\nKNearestNeighbors --&gt; f1: 0.9793\n</code></pre>"}, {"location": "API/models/knn/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> cpugpu <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, 
step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> sklearnsklearnexcuml <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> cpugpu <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> 
<p>Parametersn_neighborsIntDistribution(high=100, log=False, low=1, step=1)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p></p> <p></p>"}, {"location": "API/models/knn/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/knn/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). 
</p> <p></p>"}, {"location": "API/models/knn/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. 
Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/knn/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lars/", "title": "LeastAngleRegression", "text": "<p>Lars needs scaling</p> <p>Least-Angle Regression is a regression algorithm for high-dimensional data. Lars is similar to forward stepwise regression. At each step, it finds the feature most correlated with the target. When there are multiple features having equal correlation, instead of continuing along the same feature, it proceeds in a direction equiangular between the features.</p> <p>Corresponding estimators are:</p> <ul> <li>Lars for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BayesianRidge Bayesian ridge regression.</p> <p>HuberRegression Huber regressor.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p></p>"}, {"location": "API/models/lars/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Lars\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Lars\nMetric: r2\n\n\nResults for LeastAngleRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6028\nTime elapsed: 
0.136s\n-------------------------------------------------\nTime: 0.136s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.137s\n-------------------------------------\nLeastAngleRegression --&gt; r2: 0.6028\n</code></pre>"}, {"location": "API/models/lars/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lars/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lars/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. 
The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lars/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. 
</p> <p></p>"}, {"location": "API/models/lasso/", "title": "Lasso", "text": "<p>Lasso needs scaling accept sparse supports acceleration</p> <p>Linear least squares with l1 regularization.</p> <p>Corresponding estimators are:</p> <ul> <li>Lasso for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>ElasticNet Linear Regression with elasticnet regularization.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p>Ridge Linear least squares with l2 regularization.</p> <p></p>"}, {"location": "API/models/lasso/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Lasso\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Lasso\nMetric: r2\n\n\nResults for Lasso:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.0\nTest evaluation --&gt; r2: -0.0001\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.139s\n-------------------------------------\nLasso --&gt; r2: -0.0001 ~\n</code></pre>"}, {"location": "API/models/lasso/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> cpugpu <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, 
step=None)selectionCategoricalDistribution(choices=('cyclic', 'random'))</p> <p></p> <p></p>"}, {"location": "API/models/lasso/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lasso/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lasso/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lasso/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lda/", "title": "LinearDiscriminantAnalysis", "text": "<p>LDA</p> <p>Linear Discriminant Analysis is a classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes\u2019 rule. 
The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix.</p> <p>Corresponding estimators are:</p> <ul> <li>LinearDiscriminantAnalysis for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>LogisticRegression Logistic Regression.</p> <p>RadiusNearestNeighbors Radius Nearest Neighbors.</p> <p>QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.</p> <p></p>"}, {"location": "API/models/lda/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"LDA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9743\nTest evaluation --&gt; f1: 0.9726\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.025s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.028s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.9726\n</code></pre>"}, {"location": "API/models/lda/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParameterssolverCategoricalDistribution(choices=('svd', 'lsqr', 'eigen'))shrinkageCategoricalDistribution(choices=(None, 'auto', 0.5, 0.6, 0.7, 0.8, 0.9, 1.0))</p> <p></p> <p></p>"}, {"location": "API/models/lda/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lda/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lda/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lda/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lgb/", "title": "LightGBM", "text": "<p>LGB needs scaling accept sparse allows validation supports acceleration</p> <p>LightGBM is a gradient boosting model that uses tree based learning algorithms. 
It is designed to be distributed and efficient with the following advantages:</p> <ul> <li>Faster training speed and higher efficiency.</li> <li>Lower memory usage.</li> <li>Better accuracy.</li> <li>Capable of handling large-scale data.</li> </ul> <p>Corresponding estimators are:</p> <ul> <li>LGBMClassifier for classification tasks.</li> <li>LGBMRegressor for regression tasks.</li> </ul> <p>Read more in LightGBM's documentation.</p> <p>Info</p> <p>Using LightGBM's GPU acceleration requires additional software dependencies.</p> <p></p> <p>See Also</p> <p>CatBoost Cat Boosting Machine.</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>XGBoost Extreme Gradient Boosting.</p> <p></p>"}, {"location": "API/models/lgb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"LGB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: LGB\nMetric: f1\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9583\nTime elapsed: 0.426s\n-------------------------------------------------\nTime: 0.426s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.429s\n-------------------------------------\nLightGBM --&gt; f1: 0.9583\n</code></pre>"}, {"location": "API/models/lgb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=17, log=False, low=-1, step=2)num_leavesIntDistribution(high=40, log=False, low=20, step=1)min_child_weightFloatDistribution(high=100.0, log=True, 
low=0.0001, step=None)min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=17, log=False, low=-1, step=2)num_leavesIntDistribution(high=40, log=False, low=20, step=1)min_child_weightFloatDistribution(high=100.0, log=True, low=0.0001, step=None)min_child_samplesIntDistribution(high=30, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/lgb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lgb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. 
y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lgb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. 
<p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. 
</p> <p></p>"}, {"location": "API/models/lgb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lr/", "title": "LogisticRegression", "text": "<p>LR needs scaling accept sparse supports acceleration</p> <p>Logistic regression, despite its name, is a linear model for classification rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.</p> <p>Corresponding estimators are:</p> <ul> <li>LogisticRegression for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>GaussianProcess Gaussian process.</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>PassiveAggressive Passive Aggressive.</p> <p></p>"}, {"location": "API/models/lr/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"RF\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation 
--&gt; f1: 0.9524\nTime elapsed: 0.229s\n-------------------------------------------------\nTime: 0.229s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.232s\n-------------------------------------\nRandomForest --&gt; f1: 0.9524\n</code></pre>"}, {"location": "API/models/lr/#hyperparameters", "title": "Hyperparameters", "text": "sklearnsklearnexcuml <p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> cpugpu <p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'))max_iterIntDistribution(high=1000, log=False, low=100, step=10)l1_ratioFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/lr/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lr/#data-attributes", 
"title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lr/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models that with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. 
<p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lr/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/lsvm/", "title": "LinearSVM", "text": "<p>lSVM needs scaling accept sparse supports acceleration</p> <p>Similar to SupportVectorMachine but with a linear kernel. 
Implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.</p> <p>Corresponding estimators are:</p> <ul> <li>LinearSVC for classification tasks.</li> <li>LinearSVR for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>KNearestNeighbors K-Nearest Neighbors.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p>SupportVectorMachine Support Vector Machine.</p> <p></p>"}, {"location": "API/models/lsvm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"lSVM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: lSVM\nMetric: f1\n\n\nResults for LinearSVM:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.993\nTest evaluation --&gt; f1: 0.9722\nTime elapsed: 0.089s\n-------------------------------------------------\nTime: 0.089s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.092s\n-------------------------------------\nLinearSVM --&gt; f1: 0.9722\n</code></pre>"}, {"location": "API/models/lsvm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearncuml <p>ParameterspenaltyCategoricalDistribution(choices=('l1', 'l2'))lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))</p> <p>ParameterspenaltyCategoricalDistribution(choices=('l1', 'l2'))lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))CFloatDistribution(high=100.0, log=True, low=0.001, 
step=None)dualCategoricalDistribution(choices=(True, False))</p> sklearncuml <p>ParameterslossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))</p> <p>ParameterslossCategoricalDistribution(choices=('epsilon_insensitive', 'squared_epsilon_insensitive'))CFloatDistribution(high=100.0, log=True, low=0.001, step=None)dualCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/lsvm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/lsvm/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/lsvm/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/lsvm/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/mlp/", "title": "MultiLayerPerceptron", "text": "<p>MLP needs scaling accept sparse native multilabel allows validation</p> <p>Multi-layer Perceptron is a supervised learning algorithm that learns a function by training on a dataset. Given a set of features and a target, it can learn a non-linear function approximator for either classification or regression. It is different from logistic regression, in that between the input and the output layer, there can be one or more non-linear layers, called hidden layers.</p> <p>Corresponding estimators are:</p> <ul> <li>MLPClassifier for classification tasks.</li> <li>MLPRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>PassiveAggressive Passive Aggressive.</p> <p>Perceptron Linear Perceptron classification.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p></p>"}, {"location": "API/models/mlp/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"MLP\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: MLP\nMetric: f1\n\n\nResults for 
MultiLayerPerceptron:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9965\nTest evaluation --&gt; f1: 0.979\nTime elapsed: 1.783s\n-------------------------------------------------\nTime: 1.783s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.786s\n-------------------------------------\nMultiLayerPerceptron --&gt; f1: 0.979\n</code></pre>"}, {"location": "API/models/mlp/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametershidden_layer_1IntDistribution(high=100, log=False, low=10, step=1)hidden_layer_2IntDistribution(high=100, log=False, low=0, step=1)hidden_layer_3IntDistribution(high=10, log=False, low=0, step=1)activationCategoricalDistribution(choices=('identity', 'logistic', 'tanh', 'relu'))solverCategoricalDistribution(choices=('lbfgs', 'sgd', 'adam'))alphaFloatDistribution(high=0.1, log=True, low=0.0001, step=None)batch_sizeCategoricalDistribution(choices=('auto', 8, 16, 32, 64, 128, 256))learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'adaptive'))learning_rate_initFloatDistribution(high=0.1, log=True, low=0.001, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=500, log=False, low=50, step=10)</p> <p>Parametershidden_layer_1IntDistribution(high=100, log=False, low=10, step=1)hidden_layer_2IntDistribution(high=100, log=False, low=0, step=1)hidden_layer_3IntDistribution(high=10, log=False, low=0, step=1)activationCategoricalDistribution(choices=('identity', 'logistic', 'tanh', 'relu'))solverCategoricalDistribution(choices=('lbfgs', 'sgd', 'adam'))alphaFloatDistribution(high=0.1, log=True, low=0.0001, step=None)batch_sizeCategoricalDistribution(choices=('auto', 8, 16, 32, 64, 128, 256))learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'adaptive'))learning_rate_initFloatDistribution(high=0.1, log=True, low=0.001, step=None)power_tFloatDistribution(high=0.9, log=False, 
low=0.1, step=0.1)max_iterIntDistribution(high=500, log=False, low=50, step=10)</p> <p></p> <p></p>"}, {"location": "API/models/mlp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/mlp/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/mlp/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. 
This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/mlp/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/mnb/", "title": "MultinomialNB", "text": "<p>MNB accept sparse supports acceleration</p> <p>MultinomialNB implements the Naive Bayes algorithm for multinomially distributed data, and is one of the two classic Naive Bayes variants used in text classification (where the data are typically represented as word vector counts, although tf-idf vectors are also known to work well in practice).</p> <p>Corresponding estimators are:</p> <ul> <li>MultinomialNB for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>BernoulliNB Bernoulli Naive Bayes.</p> <p>ComplementNB Complement Naive Bayes.</p> <p>GaussianNB Gaussian Naive Bayes.</p> <p></p>"}, {"location": "API/models/mnb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"MNB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: MNB\nMetric: f1\n\n\nResults for MultinomialNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9238\nTest evaluation --&gt; f1: 0.9128\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 
0.021s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.024s\n-------------------------------------\nMultinomialNB --&gt; f1: 0.9128\n</code></pre>"}, {"location": "API/models/mnb/#hyperparameters", "title": "Hyperparameters", "text": "sklearncuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.01, step=None)fit_priorCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/mnb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/mnb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. 
n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/mnb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. 
<p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/mnb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/nf/", "title": "NaiveForecaster", "text": "<p>NF native multioutput</p> <p>NaiveForecaster is a dummy forecaster that makes forecasts using simple strategies based on naive assumptions about past trends continuing. When used in multivariate tasks, each column is forecasted with the same strategy.</p> <p>Corresponding estimators are:</p> <ul> <li>NaiveForecaster for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ExponentialSmoothing Exponential Smoothing forecaster.</p> <p>Dummy Dummy classifier/regressor.</p> <p>PolynomialTrend Polynomial Trend forecaster.</p> <p></p>"}, {"location": "API/models/nf/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"NF\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: NF\nMetric: mape\n\n\nResults for NaiveForecaster:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0858\nTest evaluation --&gt; mape: -0.2305\nTime elapsed: 0.022s\n-------------------------------------------------\nTime: 0.022s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.023s\n-------------------------------------\nNaiveForecaster --&gt; mape: -0.2305\n</code></pre>"}, 
{"location": "API/models/nf/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParametersstrategyCategoricalDistribution(choices=('last', 'mean', 'drift'))</p> <p></p> <p></p>"}, {"location": "API/models/nf/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/nf/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/nf/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. 
The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/nf/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing 
rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. 
</p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. 
<p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. 
Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of probabilities at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
</p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. 
<p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. 
</p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/ols/", "title": "OrdinaryLeastSquares", "text": "<p>OLS needs scaling accept sparse supports acceleration</p> <p>Ordinary Least Squares is just linear regression without any regularization. It fits a linear model with coefficients <code>w=(w1,  ..., wp)</code> to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.</p> <p>Corresponding estimators are:</p> <ul> <li>LinearRegression for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>ElasticNet Linear Regression with elasticnet regularization.</p> <p>Lasso Linear Regression with lasso regularization.</p> <p>Ridge Linear least squares with l2 regularization.</p> <p></p>"}, {"location": "API/models/ols/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"OLS\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: OLS\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6028\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.138s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.6028\n</code></pre>"}, {"location": "API/models/ols/#attributes", 
"title": "Attributes", "text": ""}, {"location": "API/models/ols/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/ols/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. 
study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/ols/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. 
</p> <p></p>"}, {"location": "API/models/omp/", "title": "OrthogonalMatchingPursuit", "text": "<p>OMP needs scaling</p> <p>Orthogonal Matching Pursuit implements the OMP algorithm for approximating the fit of a linear model with constraints imposed on the number of non-zero coefficients.</p> <p>Corresponding estimators are:</p> <ul> <li>OrthogonalMatchingPursuit for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>Lasso Linear Regression with lasso regularization.</p> <p>LeastAngleRegression Least Angle Regression.</p> <p>OrdinaryLeastSquares Linear Regression.</p> <p></p>"}, {"location": "API/models/omp/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"OMP\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: OMP\nMetric: r2\n\n\nResults for OrthogonalMatchingPursuit:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.4751\nTest evaluation --&gt; r2: 0.4668\nTime elapsed: 0.135s\n-------------------------------------------------\nTime: 0.135s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.136s\n-------------------------------------\nOrthogonalMatchingPursuit --&gt; r2: 0.4668\n</code></pre>"}, {"location": "API/models/omp/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/omp/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/omp/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/omp/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/pa/", "title": "PassiveAggressive", "text": "<p>PA needs scaling accept sparse allows validation</p> <p>The passive-aggressive algorithms are a family of algorithms for large-scale learning. They are similar to the Perceptron in that they do not require a learning rate. 
However, contrary to the Perceptron, they include a regularization parameter <code>C</code>.</p> <p>Corresponding estimators are:</p> <ul> <li>PassiveAggressiveClassifier for classification tasks.</li> <li>PassiveAggressiveRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>MultiLayerPerceptron Multi-layer Perceptron.</p> <p>Perceptron Linear Perceptron classification.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p></p>"}, {"location": "API/models/pa/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"PA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: PA\nMetric: f1\n\n\nResults for PassiveAggressive:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9965\nTest evaluation --&gt; f1: 0.9504\nTime elapsed: 5.512s\n-------------------------------------------------\nTime: 5.512s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.515s\n-------------------------------------\nPassiveAggressive --&gt; f1: 0.9504\n</code></pre>"}, {"location": "API/models/pa/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)max_iterIntDistribution(high=1500, log=False, low=500, step=50)lossCategoricalDistribution(choices=('hinge', 'squared_hinge'))averageCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)max_iterIntDistribution(high=1500, log=False, low=500, step=50)lossCategoricalDistribution(choices=('epsilon_insensitive', 
'squared_epsilon_insensitive'))averageCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/pa/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/pa/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/pa/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. 
This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/pa/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/perc/", "title": "Perceptron", "text": "<p>Perc needs scaling allows validation</p> <p>The Perceptron is a simple classification algorithm suitable for large scale learning. By default:</p> <ul> <li>It does not require a learning rate.</li> <li>It is not regularized (penalized).</li> <li>It updates its model only on mistakes.</li> </ul> <p>The last characteristic implies that the Perceptron is slightly faster to train than StochasticGradientDescent with the hinge loss and that the resulting models are sparser.</p> <p>Corresponding estimators are:</p> <ul> <li>Perceptron for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>MultiLayerPerceptron Multi-layer Perceptron.</p> <p>PassiveAggressive Passive Aggressive.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p></p>"}, {"location": "API/models/perc/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Perc\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Perc\nMetric: f1\n\n\nResults for Perceptron:\nFit ---------------------------------------------\nTrain 
evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9577\nTime elapsed: 5.509s\n-------------------------------------------------\nTime: 5.509s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.512s\n-------------------------------------\nPerceptron --&gt; f1: 0.9577\n</code></pre>"}, {"location": "API/models/perc/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParameterspenaltyCategoricalDistribution(choices=(None, 'l2', 'l1', 'elasticnet'))alphaFloatDistribution(high=10.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/perc/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/perc/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. 
shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/perc/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. 
estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/perc/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/pt/", "title": "PolynomialTrend", "text": "<p>PT native multioutput</p> <p>Forecast time series data with a polynomial trend, using a sklearn LinearRegression class to regress values of time series on index, after extraction of polynomial features.</p> <p>Corresponding estimators are:</p> <ul> <li>PolynomialTrendForecaster for forecasting tasks.</li> </ul> <p></p> <p>See Also</p> <p>ARIMA Autoregressive Integrated Moving Average Model.</p> <p>ETS ETS model with automatic fitting capabilities.</p> <p>NaiveForecaster Naive Forecaster.</p> <p></p>"}, {"location": "API/models/pt/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"PT\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: PT\nMetric: mape\n\n\nResults for PolynomialTrend:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.1196\nTest evaluation --&gt; mape: -0.1181\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.019s\n-------------------------------------\nPolynomialTrend --&gt; mape: -0.1181\n</code></pre>"}, {"location": 
"API/models/pt/#hyperparameters", "title": "Hyperparameters", "text": "<p>ParametersdegreeIntDistribution(high=5, log=False, low=1, step=1)with_interceptCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/pt/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/pt/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/pt/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. 
The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/pt/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing 
rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_residualsGet residuals of forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. 
</p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. 
<p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. 
Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(fh, X=None, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_interval(fh, X=None, coverage=0.9, verbose=None)[source]Get prediction intervals on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_interval</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>coverage: float or sequence, default=0.9 Nominal coverage(s) of predictive interval(s). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, 2) or shape=(n_samples, 2 * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_proba(fh, X=None, marginal=True, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>marginal: bool, default=True Whether returned distribution is marginal by time index. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnssktime.proba.Normal Predicted distribution. </p> <p></p> <p>method predict_quantiles(fh, X=None, alpha=[0.05, 0.95], verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_quantiles</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>alpha: float or list of float, default=[0.05, 0.95] A probability or list of, at which quantile forecasts are computed. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples, len(alpha)) or shape=(n_samples, len(alpha) * n_targets) for multivariate tasks. </p> <p></p> <p>method predict_residuals(y, X=None, verbose=None)[source]Get residuals of forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_residuals</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>y</code>. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method predict_var(fh, X=None, cov=False, verbose=None)[source]Get probabilistic forecasts on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_var</code> method.</p> <p>Read more in the user guide.</p> <p>Parametersfh: int, range, sequence or ForecastingHorizon The forecasting horizon encoding the time stamps to forecast at. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>cov: bool, default=False Whether to compute covariance matrix forecast or marginal variance forecasts. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multivariate tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
</p> <p></p> <p>method score(y, X=None, fh=None, metric=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sktime's score method for estimators.</p> <p>Parametersy: int, str, dict, sequence or dataframe Ground truth observations. <p>X: hashable, segment, sequence, dataframe-like or None, default=None Exogenous time series corresponding to <code>fh</code>. <p>fh: int, sequence or ForecastingHorizon or None, default=None The forecasting horizon encoding the time stamps to forecast at. <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of y with respect to a ground truth. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. 
<p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. 
</p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/qda/", "title": "QuadraticDiscriminantAnalysis", "text": "<p>QDA</p> <p>Quadratic Discriminant Analysis is a classifier with a quadratic decision boundary, generated by fitting class conditional densities to the data and using Bayes\u2019 rule. The model fits a Gaussian density to each class, with every class having its own covariance matrix.</p> <p>Corresponding estimators are:</p> <ul> <li>QuadraticDiscriminantAnalysis for classification tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>LogisticRegression Logistic Regression.</p> <p>RadiusNearestNeighbors Radius Nearest Neighbors.</p> <p></p>"}, {"location": "API/models/qda/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"QDA\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: QDA\nMetric: f1\n\n\nResults for QuadraticDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9809\nTest evaluation --&gt; f1: 0.9504\nTime elapsed: 0.023s\n-------------------------------------------------\nTime: 0.023s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.026s\n-------------------------------------\nQuadraticDiscriminantAnalysis --&gt; f1: 0.9504\n</code></pre>"}, {"location": "API/models/qda/#hyperparameters", "title": 
"Hyperparameters", "text": "<p>Parametersreg_paramFloatDistribution(high=1.0, log=False, low=0.0, step=0.1)</p> <p></p> <p></p>"}, {"location": "API/models/qda/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/qda/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/qda/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. 
If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/qda/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/rf/", "title": "RandomForest", "text": "<p>RF accept sparse native multilabel native multioutput supports acceleration</p> <p>Random forests are an ensemble learning method that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. 
Random forests correct for decision trees' habit of overfitting to their training set.</p> <p>Corresponding estimators are:</p> <ul> <li>RandomForestClassifier for classification tasks.</li> <li>RandomForestRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p>Warning</p> <p>cuML's implementation of RandomForestClassifier only supports predictions on dtype <code>float32</code>. Convert all dtypes before calling atom's run method to avoid exceptions.</p> <p></p> <p>See Also</p> <p>DecisionTree Single Decision Tree.</p> <p>ExtraTrees Extremely Randomized Trees.</p> <p>HistGradientBoosting Histogram-based Gradient Boosting Machine.</p> <p></p>"}, {"location": "API/models/rf/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"RF\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9524\nTime elapsed: 0.232s\n-------------------------------------------------\nTime: 0.232s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.236s\n-------------------------------------\nRandomForest --&gt; f1: 0.9524\n</code></pre>"}, {"location": "API/models/rf/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, 
step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> cpugpu <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('gini', 'entropy'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> sklearnsklearnexcuml <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> cpugpu <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 
'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=10, step=10)criterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'poisson'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))bootstrapCategoricalDistribution(choices=(True, False))max_samplesCategoricalDistribution(choices=(None, 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p></p> <p></p>"}, {"location": "API/models/rf/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/rf/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. 
train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/rf/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. 
Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. 
The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/rf/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take 
bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. 
</p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task is used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. 
</p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. 
If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. 
</p> <p></p>"}, {"location": "API/models/ridge/", "title": "Ridge", "text": "<p>Ridge needs scaling accept sparse native multilabel supports acceleration</p> <p>If classifier, it first converts the target values into {-1, 1} and then treats the problem as a regression task.</p> <p>Corresponding estimators are:</p> <ul> <li>RidgeClassifier for classification tasks.</li> <li>Ridge for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p>Warning</p> <p>Engines <code>sklearnex</code> and <code>cuml</code> are only available for regression tasks.</p> <p></p> <p>See Also</p> <p>BayesianRidge Bayesian ridge regression.</p> <p>ElasticNet Linear Regression with elasticnet regularization.</p> <p>Lasso Linear Regression with lasso regularization.</p> <p></p>"}, {"location": "API/models/ridge/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import fetch_california_housing\n\n&gt;&gt;&gt; X, y = fetch_california_housing(return_X_y=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Ridge\", metric=\"r2\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Ridge\nMetric: r2\n\n\nResults for Ridge:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6067\nTest evaluation --&gt; r2: 0.6028\nTime elapsed: 0.136s\n-------------------------------------------------\nTime: 0.136s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.137s\n-------------------------------------\nRidge --&gt; r2: 0.6028\n</code></pre>"}, {"location": "API/models/ridge/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> cpugpu 
<p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> sklearnsklearnexcuml <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> cpugpu <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p>ParametersalphaFloatDistribution(high=10.0, log=True, low=0.001, step=None)solverCategoricalDistribution(choices=('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'))</p> <p></p> <p></p>"}, {"location": "API/models/ridge/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/ridge/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. 
Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/ridge/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. 
Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. 
The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/ridge/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take 
bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. 
</p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. 
</p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. 
If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/rnn/", "title": "RadiusNearestNeighbors", "text": "<p>RNN needs scaling accept sparse native multilabel native multioutput</p> <p>Radius Nearest Neighbors implements the nearest neighbors vote, where the neighbors are selected from within a given radius. 
For regression, the target is predicted by local interpolation of the targets associated with the nearest neighbors in the training set.</p> <p>Warning</p> <ul> <li>The <code>radius</code> parameter should be tuned to the data at hand or   the model will perform poorly.</li> <li>If outliers are detected, the estimator raises an exception   unless <code>est_params={\"outlier_label\": \"most_frequent\"}</code> is used.</li> </ul> <p>Corresponding estimators are:</p> <ul> <li>RadiusNeighborsClassifier for classification tasks.</li> <li>RadiusNeighborsRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>KNearestNeighbors K-Nearest Neighbors.</p> <p>LinearDiscriminantAnalysis Linear Discriminant Analysis.</p> <p>QuadraticDiscriminantAnalysis Quadratic Discriminant Analysis.</p> <p></p>"}, {"location": "API/models/rnn/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\n...     models=\"RNN\",\n...     metric=\"f1\",\n...     est_params={\"outlier_label\": \"most_frequent\"},\n...     verbose=2,\n... 
)\n\n\nTraining ========================= &gt;&gt;\nModels: RNN\nMetric: f1\n\n\nResults for RadiusNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.7717\nTime elapsed: 0.091s\n-------------------------------------------------\nTime: 0.091s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.094s\n-------------------------------------\nRadiusNearestNeighbors --&gt; f1: 0.7717 ~\n</code></pre>"}, {"location": "API/models/rnn/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParametersradiusFloatDistribution(high=100.0, log=False, low=0.01, step=None)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p>ParametersradiusFloatDistribution(high=100.0, log=False, low=0.01, step=None)weightsCategoricalDistribution(choices=('uniform', 'distance'))algorithmCategoricalDistribution(choices=('auto', 'ball_tree', 'kd_tree', 'brute'))leaf_sizeIntDistribution(high=40, log=False, low=20, step=1)pIntDistribution(high=2, log=False, low=1, step=1)</p> <p></p> <p></p>"}, {"location": "API/models/rnn/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/rnn/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. 
train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/rnn/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. 
Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. 
The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/rnn/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take 
bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. 
</p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. 
</p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. 
</p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. 
</p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. 
If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/sgd/", "title": "StochasticGradientDescent", "text": "<p>SGD needs scaling accept sparse allows validation</p> <p>Stochastic Gradient Descent is a simple yet very efficient approach to fitting linear classifiers and regressors under convex loss functions. 
Even though SGD has been around in the machine learning community for a long time, it has received a considerable amount of attention just recently in the context of large-scale learning.</p> <p>Corresponding estimators are:</p> <ul> <li>SGDClassifier for classification tasks.</li> <li>SGDRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>MultiLayerPerceptron Multi-layer Perceptron.</p> <p>PassiveAggressive Passive Aggressive.</p> <p>SupportVectorMachine Support Vector Machine.</p> <p></p>"}, {"location": "API/models/sgd/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"SGD\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: SGD\nMetric: f1\n\n\nResults for StochasticGradientDescent:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9948\nTest evaluation --&gt; f1: 0.9722\nTime elapsed: 5.506s\n-------------------------------------------------\nTime: 5.506s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.509s\n-------------------------------------\nStochasticGradientDescent --&gt; f1: 0.9722\n</code></pre>"}, {"location": "API/models/sgd/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParameterslossCategoricalDistribution(choices=('hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'))penaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, 
log=False, low=500, step=50)epsilonFloatDistribution(high=1.0, log=True, low=0.0001, step=None)learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'optimal', 'adaptive'))eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)averageCategoricalDistribution(choices=(True, False))</p> <p>ParameterslossCategoricalDistribution(choices=('squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'))penaltyCategoricalDistribution(choices=(None, 'l1', 'l2', 'elasticnet'))alphaFloatDistribution(high=1.0, log=True, low=0.0001, step=None)l1_ratioFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)max_iterIntDistribution(high=1500, log=False, low=500, step=50)epsilonFloatDistribution(high=1.0, log=True, low=0.0001, step=None)learning_rateCategoricalDistribution(choices=('constant', 'invscaling', 'optimal', 'adaptive'))eta0FloatDistribution(high=10.0, log=True, low=0.01, step=None)power_tFloatDistribution(high=0.9, log=False, low=0.1, step=0.1)averageCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/sgd/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/sgd/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. 
y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/sgd/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. 
<p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. 
</p> <p></p>"}, {"location": "API/models/sgd/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. 
<p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/svm/", "title": "SupportVectorMachine", "text": "<p>SVM needs scaling accept sparse supports acceleration</p> <p>The implementation of the Support Vector Machine is based on libsvm. The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples. For large datasets consider using a LinearSVM or a StochasticGradientDescent model instead.</p> <p>Corresponding estimators are:</p> <ul> <li>SVC for classification tasks.</li> <li>SVR for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>LinearSVM Linear Support Vector Machine.</p> <p>MultiLayerPerceptron Multi-layer Perceptron.</p> <p>StochasticGradientDescent Stochastic Gradient Descent.</p> <p></p>"}, {"location": "API/models/svm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"SVM\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: SVM\nMetric: f1\n\n\nResults for SupportVectorMachine:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9913\nTest evaluation --&gt; f1: 0.979\nTime 
elapsed: 0.095s\n-------------------------------------------------\nTime: 0.095s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.098s\n-------------------------------------\nSupportVectorMachine --&gt; f1: 0.979\n</code></pre>"}, {"location": "API/models/svm/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression sklearnsklearnexcuml <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> cpugpu <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> sklearnsklearnexcuml <p>ParametersCFloatDistribution(high=100.0, log=True, 
low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> cpugpu <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p>ParametersCFloatDistribution(high=100.0, log=True, low=0.001, step=None)kernelCategoricalDistribution(choices=('linear', 'poly', 'rbf', 'sigmoid'))degreeIntDistribution(high=5, log=False, low=2, step=1)gammaCategoricalDistribution(choices=('scale', 'auto'))coef0FloatDistribution(high=1.0, log=False, low=-1.0, step=None)epsilonFloatDistribution(high=100.0, log=True, low=0.001, step=None)shrinkingCategoricalDistribution(choices=(True, False))</p> <p></p> <p></p>"}, {"location": "API/models/svm/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/svm/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/svm/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/svm/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. 
</p> <p></p>"}, {"location": "API/models/tree/", "title": "DecisionTree", "text": "<p>Tree accept sparse native multilabel native multioutput</p> <p>A single decision tree classifier/regressor.</p> <p>Corresponding estimators are:</p> <ul> <li>DecisionTreeClassifier for classification tasks.</li> <li>DecisionTreeRegressor for regression tasks.</li> </ul> <p>Read more in sklearn's documentation.</p> <p></p> <p>See Also</p> <p>ExtraTree Extremely Randomized Tree.</p> <p>ExtraTrees Extremely Randomized Trees.</p> <p>RandomForest Random Forest.</p> <p></p>"}, {"location": "API/models/tree/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"Tree\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: Tree\nMetric: f1\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9589\nTime elapsed: 0.032s\n-------------------------------------------------\nTime: 0.032s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.035s\n-------------------------------------\nDecisionTree --&gt; f1: 0.9589\n</code></pre>"}, {"location": "API/models/tree/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>ParameterscriterionCategoricalDistribution(choices=('gini', 'entropy'))splitterCategoricalDistribution(choices=('best', 'random'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 
0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p>ParameterscriterionCategoricalDistribution(choices=('squared_error', 'absolute_error', 'friedman_mse', 'poisson'))splitterCategoricalDistribution(choices=('best', 'random'))max_depthCategoricalDistribution(choices=(None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16))min_samples_splitIntDistribution(high=20, log=False, low=2, step=1)min_samples_leafIntDistribution(high=20, log=False, low=1, step=1)max_featuresCategoricalDistribution(choices=(None, 'sqrt', 'log2', 0.5, 0.6, 0.7, 0.8, 0.9))ccp_alphaFloatDistribution(high=0.035, log=False, low=0.0, step=0.005)</p> <p></p> <p></p>"}, {"location": "API/models/tree/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/tree/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. <p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). 
columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/tree/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. 
bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/tree/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. 
The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. 
</p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. 
The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. </p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. 
<p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. 
</p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. 
</p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. 
<p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. 
The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. <p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. 
The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/models/xgb/", "title": "XGBoost", "text": "<p>XGB needs scaling accept sparse allows validation supports acceleration</p> <p>XGBoost is an optimized distributed gradient boosting model designed to be highly efficient, flexible and portable. XGBoost provides a parallel tree boosting that solve many data science problems in a fast and accurate way.</p> <p>Corresponding estimators are:</p> <ul> <li>XGBClassifier for classification tasks.</li> <li>XGBRegressor for regression tasks.</li> </ul> <p>Read more in XGBoost's documentation.</p> <p></p> <p>See Also</p> <p>CatBoost Cat Boosting Machine.</p> <p>GradientBoostingMachine Gradient Boosting Machine.</p> <p>LightGBM Light Gradient Boosting Machine.</p> <p></p>"}, {"location": "API/models/xgb/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(models=\"XGB\", metric=\"f1\", verbose=2)\n\n\nTraining ========================= &gt;&gt;\nModels: XGB\nMetric: f1\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9583\nTime elapsed: 0.401s\n-------------------------------------------------\nTime: 
0.401s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.404s\n-------------------------------------\nXGBoost --&gt; f1: 0.9583\n</code></pre>"}, {"location": "API/models/xgb/#hyperparameters", "title": "Hyperparameters", "text": "classificationregression <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=20, log=False, low=1, step=1)gammaFloatDistribution(high=1.0, log=False, low=0.0, step=None)min_child_weightIntDistribution(high=10, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)</p> <p>Parametersn_estimatorsIntDistribution(high=500, log=False, low=20, step=10)learning_rateFloatDistribution(high=1.0, log=True, low=0.01, step=None)max_depthIntDistribution(high=20, log=False, low=1, step=1)gammaFloatDistribution(high=1.0, log=False, low=0.0, step=None)min_child_weightIntDistribution(high=10, log=False, low=1, step=1)subsampleFloatDistribution(high=1.0, log=False, low=0.5, step=0.1)colsample_bytreeFloatDistribution(high=1.0, log=False, low=0.4, step=0.1)reg_alphaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)reg_lambdaFloatDistribution(high=100.0, log=True, low=0.0001, step=None)</p> <p></p> <p></p>"}, {"location": "API/models/xgb/#attributes", "title": "Attributes", "text": ""}, {"location": "API/models/xgb/#data-attributes", "title": "Data attributes", "text": "<p>Attributespipeline: PipelinePipeline of transforms. 
<p>Models that used automated feature scaling have the scaler added.</p> <p>Tip</p> <p>Use the plot_pipeline method to visualize the pipeline.</p> <p>mapping: dict[str, dict[collections.abc.Hashable, int | numpy.integer | float | numpy.floating]]Encoded values and their respective mapped values. <p>The column name is the key to its mapping dictionary. Only for columns mapped to a single column (e.g., Ordinal, Leave-one-out, etc...). dataset: DataFrameComplete data set. train: DataFrameTraining set. test: DataFrameTest set. X: DataFrameFeature set. y: Series | DataFrameTarget column. X_train: DataFrameFeatures of the training set. y_train: Series | DataFrameTarget column of the training set. X_test: DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. X_holdout: DataFrame | NoneFeatures of the holdout set. y_holdout: Series | DataFrame | NoneTarget column of the holdout set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/models/xgb/#utility-attributes", "title": "Utility attributes", "text": "<p>Attributesname: strName of the model. <p>Use the property's <code>@setter</code> to change the model's name. The acronym always stays at the beginning of the model's name. If the model is being tracked by mlflow, the name of the corresponding run also changes. run: RunMlflow run corresponding to this model. <p>This property is only available for models with mlflow tracking enabled. study: StudyOptuna study used for hyperparameter tuning. <p>This property is only available for models that ran hyperparameter tuning. 
trials: pd.DataFrameOverview of the trials' results. <p>This property is only available for models that ran hyperparameter tuning. All durations are in seconds. Columns include:</p> <ul> <li>[param_name]: Parameter value used in this trial.</li> <li>estimator: Estimator used in this trial.</li> <li>[metric_name]: Metric score of the trial.</li> <li>[best_metric_name]: Best score so far in this study.</li> <li>time_trial: Duration of the trial.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>state: Trial's state (COMPLETE, PRUNED, FAIL). best_trial: FrozenTrialTrial that returned the highest score. <p>For multi-metric runs, the best trial is the trial that performed best on the main metric. Use the property's <code>@setter</code> to change the best trial. See here an example. This property is only available for models that ran hyperparameter tuning. best_params: dict[str, Any]Estimator's parameters in the best trial. <p>This property is only available for models that ran hyperparameter tuning. estimator: PredictorEstimator fitted on the training set. evals: dict[str, list[Float]]Scores obtained per iteration of the training. <p>Only the scores of the main metric are tracked. Included keys are: train and test. This property is only available for models with in-training-validation. bootstrap: pd.DataFrameOverview of the bootstrapping scores. <p>The dataframe has shape=(n_bootstrap, metric) and shows the score obtained by every bootstrapped sample for every metric. Using <code>atom.bootstrap.mean()</code> yields the same values as <code>[metric]_bootstrap</code>. This property is only available for models that ran bootstrapping. results: pd.SeriesOverview of the model results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. feature_importance: pd.SeriesNormalized feature importance scores. <p>The sum of importances for all features is 1. The scores are extracted from the estimator's <code>scores_</code>, <code>coef_</code> or <code>feature_importances_</code> attribute, checked in that order. This property is only available for estimators with at least one of those attributes. </p> <p></p>"}, {"location": "API/models/xgb/#methods", "title": "Methods", "text": "<p>The plots can be called directly from the model. The remaining utility methods can be found hereunder.</p> <p>bootstrappingApply a bootstrap algorithm.calibrateCalibrate the model.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from the model.create_appCreate an interactive app to test model predictions.create_dashboardCreate an interactive dashboard to analyze the model.cross_validateEvaluate the model using cross-validation.decision_functionGet confidence scores on new data or existing rows.evaluateGet the model's scores for the provided metrics.export_pipelineExport the transformer pipeline with final estimator.fitFit and validate the model.full_trainTrain the estimator on the complete dataset.get_best_thresholdGet the threshold that maximizes the ROC curve.hyperparameter_tuningRun the hyperparameter tuning algorithm.inverse_transformInversely transform new data through the pipeline.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing 
rows.predict_probaGet class probabilities on new data or existing rows.registerRegister the model in mlflow's model registry.reset_aestheticsReset the plot aesthetics to their default values.save_estimatorSave the estimator to a pickle file.scoreGet a metric score on new data.serveServe the model as rest API endpoint for inference.transformTransform new data through the pipeline.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.</p> <p></p> <p>method bootstrapping(n_bootstrap, reset=False)[source]Apply a bootstrap algorithm.</p> <p>Take bootstrapped samples from the training set and test them on the test set to get a distribution of the model's results.</p> <p>Parametersn_bootstrap: int Number of bootstrapped samples to fit on. <p>reset: bool, default=False Whether to start a new run or continue the existing one. </p> <p></p> <p>method calibrate(**kwargs)[source]Calibrate the model.</p> <p>Applies probability calibration on the model. The estimator is trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started using the name <code>[model_name]_calibrate</code>. Since the estimator changed, the model is cleared. Only for classifiers.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's CCV. Using cv=\"prefit\" will use the trained model and fit the calibrator on the test set. Use this only if you have another, independent set for testing. </p> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. 
See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot cols in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from the model.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. 
The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Cached predictions.</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method create_app(**kwargs)[source]Create an interactive app to test model predictions.</p> <p>Demo your machine learning model with a friendly web interface. This app launches directly in the notebook or on an external browser page. The created Interface instance can be accessed through the <code>app</code> attribute.</p> <p>Parameters**kwargs Additional keyword arguments for the Interface instance or the Interface.launch method. </p> <p></p> <p>method create_dashboard(rows=\"test\", filename=None, **kwargs)[source]Create an interactive dashboard to analyze the model.</p> <p>ATOM uses the explainerdashboard package to provide a quick and easy way to analyze and explain the predictions and workings of the model. The dashboard allows you to investigate SHAP values, permutation importances, interaction effects, partial dependence plots, all kinds of performance plots, and even individual decision trees.</p> <p>By default, the dashboard renders in a new tab in your default browser, but if preferable, you can render it inside the notebook using the <code>mode=\"inline\"</code> parameter. The created ExplainerDashboard instance can be accessed through the <code>dashboard</code> attribute. This method is not available for multioutput tasks.</p> <p>Note</p> <p>Plots displayed by the dashboard are not created by ATOM and can differ from those retrieved through this package.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to get the report from. <p>filename: str, Path or None, default=None Filename or pathlib.Path of the file to save. None to not save anything. <p>**kwargs Additional keyword arguments for the ExplainerDashboard instance. 
</p> <p></p> <p>method cross_validate(**kwargs)[source]Evaluate the model using cross-validation.</p> <p>This method cross-validates the whole pipeline on the complete dataset. Use it to assess the robustness of the solution's performance.</p> <p>Parameters**kwargs Additional keyword arguments for sklearn's cross_validate function. If the scoring method is not specified, it uses atom's metric. <p>Returnspd.DataFrame Overview of the results. </p> <p></p> <p>method decision_function(X, verbose=None)[source]Get confidence scores on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>decision_function</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predicted confidence scores with shape=(n_samples,) for binary classification tasks or shape=(n_samples, n_classes) for multiclass classification tasks. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get the model's scores for the provided metrics.</p> <p>Tip</p> <p>Use the self-get_best_threshold or plot_threshold method to determine a suitable value for the <code>threshold</code> parameter.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metrics to calculate. If None, a selection of the most common metrics per task are used. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column, as returned by the get_best_threshold method). If float, the same threshold is applied to all target columns.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.Series Scores of the model. </p> <p></p> <p>method export_pipeline()[source]Export the transformer pipeline with final estimator.</p> <p>The returned pipeline is already fitted on the training set. Note that if the model used automated feature scaling, the Scaler is added to the pipeline.</p> <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method fit(X=None, y=None)[source]Fit and validate the model.</p> <p>The estimator is fitted using the best hyperparameters found during hyperparameter tuning. Afterwards, the estimator is evaluated on the test set. Only use this method to re-fit the model after having continued the study.</p> <p>ParametersX: dataframe or None Feature set with shape=(n_samples, n_features). If None, <code>self.X_train</code> is used. <p>y: series, dataframe or None Target column corresponding to `X`. If None, <code>self.y_train</code> is used. </p> <p></p> <p>method full_train(include_holdout=False)[source]Train the estimator on the complete dataset.</p> <p>In some cases, it might be desirable to use all available data to train a final model. Note that doing this means that the estimator can no longer be evaluated on the test set. The newly retrained estimator will replace the <code>estimator</code> attribute. If there is an active mlflow experiment, a new run is started with the name <code>[model_name]_full_train</code>. 
Since the estimator changed, the model is cleared.</p> <p>Warning</p> <p>Although the model is trained on the complete dataset, the pipeline is not. To get a fully trained pipeline, use: <code>pipeline = atom.export_pipeline().fit(atom.X, atom.y)</code>.</p> <p>Parametersinclude_holdout: bool, default=False Whether to include the holdout set (if available) in the training of the estimator. It's discouraged to use this option since it means the model can no longer be evaluated on any set. </p> <p></p> <p>method get_best_threshold(rows=\"train\")[source]Get the threshold that maximizes the ROC curve.</p> <p>Only available for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersrows: hashable, segment, sequence or dataframe Selection of rows on which to calculate the threshold. <p>Returnsfloat or list Best threshold or list of thresholds for multilabel tasks. </p> <p></p> <p>method hyperparameter_tuning(n_trials, reset=False)[source]Run the hyperparameter tuning algorithm.</p> <p>Search for the best combination of hyperparameters. The function to optimize is evaluated either with a K-fold cross-validation on the training set or using a random train and validation split every trial. Use this method to continue the optimization.</p> <p>Parametersn_trials: int Number of trials for the hyperparameter tuning. <p>reset: bool, default=False Whether to start a new study or continue the existing one. </p> <p></p> <p>method inverse_transform(X=None, y=None, verbose=None)[source]Inversely transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. The rest should all implement a <code>inverse_transform</code> method. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, inversely transform only the target column. 
If called from a model that used automated feature scaling, the scaling is inverted as well.</p> <p>ParametersX: dataframe-like or None, default=None Transformed feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Original feature set. Only returned if provided. <p>series or dataframe Original target column. Only returned if provided. </p> <p></p> <p>method predict(X, verbose=None)[source]Get predictions on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsseries or dataframe Predictions with shape=(n_samples,) or shape=(n_samples, n_targets) for multioutput tasks. </p> <p></p> <p>method predict_log_proba(X, verbose=None)[source]Get class log-probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. 
Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_log_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class log-probability predictions with shape=(n_samples, n_classes). </p> <p></p> <p>method predict_proba(X, verbose=None)[source]Get class probabilities on new data or existing rows.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped. The estimator must have a <code>predict_proba</code> method.</p> <p>Read more in the user guide.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Class probability predictions with shape=(n_samples, n_classes) or (n_targets * n_samples, n_classes) with a multiindex format for multioutput tasks. </p> <p></p> <p>method register(name=None, stage=\"None\", archive_existing_versions=False)[source]Register the model in mlflow's model registry.</p> <p>This method is only available when model tracking is enabled using one of the following URI schemes: databricks, http, https, postgresql, mysql, sqlite, mssql.</p> <p>Parametersname: str or None, default=None Name for the registered model. If None, the model's full name is used. If the name of the model already exists, a new model version is created. <p>stage: str, default=\"None\" New desired stage for the model. 
<p>archive_existing_versions: bool, default=False Whether all existing model versions in the <code>stage</code> will be moved to the \"Archived\" stage. Only valid when <code>stage</code> is \"Staging\" or \"Production\", otherwise an error will be raised. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method save_estimator(filename=\"auto\")[source]Save the estimator to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. </p> <p></p> <p>method score(X, y=None, metric=None, sample_weight=None, verbose=None)[source]Get a metric score on new data.</p> <p>New data is first transformed through the model's pipeline. Transformers that are only applied on the training set are skipped.</p> <p>Read more in the user guide.</p> <p>Info</p> <p>If the <code>metric</code> parameter is left to its default value, the method returns atom's metric score, not the metric returned by sklearn's score method for estimators.</p> <p>ParametersX: hashable, segment, sequence or dataframe-like Selection of rows or feature set with shape=(n_samples, n_features) to make predictions on. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: <code>X</code> must be a selection of rows in the dataset.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>metric: str, func, scorer or None, default=None Metric to calculate. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred) -&gt; score</code> or a scorer object. 
If None, it uses atom's metric (the main metric for multi-metric runs). <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsfloat Metric score of X with respect to y. </p> <p></p> <p>method serve(method=\"predict\", host=\"127.0.0.1\", port=8000)[source]Serve the model as rest API endpoint for inference.</p> <p>The complete pipeline is served with the model. The inference data must be supplied as json to the HTTP request, e.g. <code>requests.get(\"http://127.0.0.1:8000/\", json=X.to_json())</code>. The deployment is done on a ray cluster. The default <code>host</code> and <code>port</code> parameters deploy to localhost.</p> <p>Tip</p> <p>Use <code>import ray; ray.serve.shutdown()</code> to close the endpoint after finishing.</p> <p>Parametersmethod: str, default=\"predict\" Estimator's method to do inference on. <p>host: str, default=\"127.0.0.1\" Host for HTTP servers to listen on. To expose serve publicly, you probably want to set this to \"0.0.0.0\". <p>port: int, default=8000 Port for HTTP server. </p> <p></p> <p>method transform(X=None, y=None, verbose=None)[source]Transform new data through the pipeline.</p> <p>Transformers that are only applied on the training set are skipped. If only <code>X</code> or only <code>y</code> is provided, it ignores transformers that require the other parameter. This can be of use to, for example, transform only the target column. If called from a model that used automated feature scaling, the data is scaled as well.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored in the transformers. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>verbose: int or None, default=None Verbosity level for the transformers in the pipeline. If None, it uses the pipeline's verbosity. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p>"}, {"location": "API/nlp/textcleaner/", "title": "TextCleaner", "text": "<p>class atom.nlp.TextCleaner(decode=True, lower_case=True, drop_email=True, regex_email=None, drop_url=True, regex_url=None, drop_html=True, regex_html=None, drop_emoji=True, regex_emoji=None, drop_number=True, regex_number=None, drop_punctuation=True, verbose=0, logger=None)[source]Applies standard text cleaning to the corpus.</p> <p>Transformations include normalizing characters and dropping noise from the text (emails, HTML tags, URLs, etc...). The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. 
If there is no column with that name, an exception is raised.</p> <p>This class can be accessed from atom through the textclean method. Read more in the user guide.</p> <p>Parametersdecode: bool, default=True Whether to decode unicode characters to their ascii representations. <p>lower_case: bool, default=True Whether to convert all characters to lower case. <p>drop_email: bool, default=True Whether to drop email addresses from the text. <p>regex_email: str, default=None Regex used to search for email addresses. If None, it uses <code>r\"[\\w.-]+@[\\w-]+\\.[\\w.-]+\"</code>. <p>drop_url: bool, default=True Whether to drop URL links from the text. <p>regex_url: str, default=None Regex used to search for URLs. If None, it uses <code>r\"https?://\\S+|www\\.\\S+\"</code>. <p>drop_html: bool, default=True Whether to drop HTML tags from the text. This option is particularly useful if the data was scraped from a website. <p>regex_html: str, default=None Regex used to search for html tags. If None, it uses <code>r\"&lt;.*?&gt;\"</code>. <p>drop_emoji: bool, default=True Whether to drop emojis from the text. <p>regex_emoji: str, default=None Regex used to search for emojis. If None, it uses <code>r\":[a-z_]+:\"</code>. <p>drop_number: bool, default=True Whether to drop numbers from the text. <p>regex_number: str, default=None Regex used to search for numbers. If None, it uses <code>r\"\\b\\d+\\b\"</code>. Note that numbers adjacent to letters are not removed. <p>drop_punctuation: bool, default=True Whether to drop punctuations from the text. Characters considered punctuation are <code>!\"#$%&amp;'()*+,-./:;&lt;=&gt;?@[\\]^_`{|}~</code>. <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p></p> <p></p> <p>See Also</p> <p>TextNormalizer Normalize the corpus.</p> <p>Tokenizer Tokenize the corpus.</p> <p>Vectorizer Vectorize text data.</p> <p></p>"}, {"location": "API/nlp/textcleaner/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import fetch_20newsgroups\n\n&gt;&gt;&gt; X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n&gt;&gt;&gt; X = np.array(X).reshape(-1, 1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n                                                 corpus  target\n0     From: fabian@vivian.w.open.de (Fabian Hoppe)\\n...       1\n1     From: nyeda@cnsvax.uwec.edu (David Nye)\\nSubje...       0\n2     From: urathi@net4.ICS.UCI.EDU (Unmesh Rathi)\\n...       1\n3     From: inoue@crd.yokogawa.co.jp (Inoue Takeshi)...       1\n4     From: sandvik@newton.apple.com (Kent Sandvik)\\...       0\n...                                                 ...     ...\n1662  From: kutluk@ccl.umist.ac.uk (Kutluk Ozguven)\\...       0\n1663  From: dmp1@ukc.ac.uk (D.M.Procida)\\nSubject: R...       2\n1664  From: tdunbar@vtaix.cc.vt.edu (Thomas Dunbar)\\...       1\n1665  From: dmp@fig.citib.com (Donna M. Paino)\\nSubj...       2\n1666  From: cdm@pmafire.inel.gov (Dale Cook)\\nSubjec...       
2\n\n[1667 rows x 2 columns]\n\n\n&gt;&gt;&gt; atom.textclean(verbose=2)\n\nFitting TextCleaner...\nCleaning the corpus...\n --&gt; Decoding unicode characters to ascii.\n --&gt; Converting text to lower case.\n --&gt; Dropping emails from documents.\n --&gt; Dropping URL links from documents.\n --&gt; Dropping HTML tags from documents.\n --&gt; Dropping emojis from documents.\n --&gt; Dropping numbers from documents.\n --&gt; Dropping punctuation from the text.\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n                                                 corpus  target\n0     from  fabian hoppe\\nsubject searching cadsoftw...       1\n1     from  david nye\\nsubject re after  years can w...       0\n2     from  unmesh rathi\\nsubject motif and intervie...       1\n3     from  inoue takeshi\\nsubject how to see charac...       1\n4     from  kent sandvik\\nsubject re slavery was re ...       0\n...                                                 ...     ...\n1662  from  kutluk ozguven\\nsubject re jewish settle...       0\n1663  from  dmprocida\\nsubject re homeopathy a respe...       2\n1664  from  thomas dunbar\\nsubject re x toolkits\\nsu...       1\n1665  from  donna m paino\\nsubject psoriatic arthrit...       2\n1666  from  dale cook\\nsubject re morbus meniere  is...       2\n\n[1667 rows x 2 columns]\n</code></pre> <pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom.nlp import TextCleaner\n&gt;&gt;&gt; from sklearn.datasets import fetch_20newsgroups\n\n&gt;&gt;&gt; X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... 
)\n&gt;&gt;&gt; X = np.array(X).reshape(-1, 1)\n\n&gt;&gt;&gt; textcleaner = TextCleaner(verbose=2)\n&gt;&gt;&gt; X = textcleaner.transform(X)\n\nCleaning the corpus...\n --&gt; Decoding unicode characters to ascii.\n --&gt; Converting text to lower case.\n --&gt; Dropping emails from documents.\n --&gt; Dropping URL links from documents.\n --&gt; Dropping HTML tags from documents.\n --&gt; Dropping emojis from documents.\n --&gt; Dropping numbers from documents.\n --&gt; Dropping punctuation from the text.\n\n\n&gt;&gt;&gt; print(X)\n\n                                                 corpus\n0     from  mark a deloura\\nsubject looking for x wi...\n1     from  der mouse\\nsubject re creating  bit wind...\n2     from  keith m ryan\\nsubject re where are they ...\n3     from  steven grimm\\nsubject re opinions on all...\n4     from  peter kaminski\\nsubject re krillean phot...\n...                                                 ...\n1662  from donald mackie \\nsubject re seeking advice...\n1663  from  gordon banks\\nsubject re update help was...\n1664  from  keith m ryan\\nsubject re political athei...\n1665  from  benedikt rosenau\\nsubject re biblical ra...\n1666  from derrick j brashear \\nsubject mouseless op...\n\n[1667 rows x 1 columns]\n</code></pre>"}, {"location": "API/nlp/textcleaner/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformApply the transformations to the data.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. 
Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Apply the transformations to the data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed corpus. </p> <p></p>"}, {"location": "API/nlp/textnormalizer/", "title": "TextNormalizer", "text": "<p>class atom.nlp.TextNormalizer(stopwords=True, custom_stopwords=None, stem=False, lemmatize=True, verbose=0, logger=None)[source]Normalize the corpus.</p> <p>Convert words to a more uniform standard. The transformations are applied on the column named <code>corpus</code>, in the same order the parameters are presented. If there is no column with that name, an exception is raised. 
If the provided documents are strings, words are separated by spaces.</p> <p>This class can be accessed from atom through the textnormalize method. Read more in the user guide.</p> <p>Parametersstopwords: bool or str, default=True Whether to remove a predefined dictionary of stopwords. <ul> <li>If False: Don't remove any predefined stopwords.</li> <li>If True: Drop predefined english stopwords from the text.</li> <li>If str: Language from <code>nltk.corpus.stopwords.words</code>.</li> </ul> <p>custom_stopwords: sequence or None, default=None Custom stopwords to remove from the text. <p>stem: bool or str, default=False Whether to apply stemming using SnowballStemmer. <ul> <li>If False: Don't apply stemming.</li> <li>If True: Apply stemmer based on the english language.</li> <li>If str: Language from <code>SnowballStemmer.languages</code>.</li> </ul> <p>lemmatize: bool, default=True Whether to apply lemmatization using WordNetLemmatizer. <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesfeature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>TextCleaner Applies standard text cleaning to the corpus.</p> <p>Tokenizer Tokenize the corpus.</p> <p>Vectorizer Vectorize text data.</p> <p></p>"}, {"location": "API/nlp/textnormalizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    
[\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n&gt;&gt;&gt; y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       1\n\n\n&gt;&gt;&gt; atom.textnormalize(stopwords=\"english\", lemmatize=True, verbose=2)\n\nFitting TextNormalizer...\nNormalizing the corpus...\n --&gt; Dropping stopwords.\n --&gt; Applying lemmatization.\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n                           corpus  target\n0                     [new, york]       0\n1              [another, line...]       1\n2               [New, york, nice]       0\n3  [new, york, large, washington]       1\n4                     [run, test]       0\n5             [I, \u00e0m, ne'w, york]       1\n6                          [test]       0\n7                     [hi, test!]       1\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.nlp import TextNormalizer\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n&gt;&gt;&gt; textnormalizer = TextNormalizer(\n...     stopwords=\"english\",\n...     lemmatize=True,\n...     verbose=2,\n... 
)\n&gt;&gt;&gt; X = textnormalizer.transform(X)\n\nNormalizing the corpus...\n --&gt; Dropping stopwords.\n --&gt; Applying lemmatization.\n\n\n&gt;&gt;&gt; print(X)\n\n                           corpus\n0             [I, \u00e0m, ne'w, york]\n1               [New, york, nice]\n2                     [new, york]\n3                     [hi, test!]\n4              [another, line...]\n5  [new, york, large, washington]\n6                     [run, test]\n7                          [test]\n</code></pre>"}, {"location": "API/nlp/textnormalizer/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformNormalize the text.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. 
<p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Normalize the text.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed corpus. </p> <p></p>"}, {"location": "API/nlp/tokenizer/", "title": "Tokenizer", "text": "<p>class atom.nlp.Tokenizer(bigram_freq=None, trigram_freq=None, quadgram_freq=None, verbose=0, logger=None)[source]Tokenize the corpus.</p> <p>Convert documents into sequences of words. Additionally, create n-grams (represented by words united with underscores, e.g., \"New_York\") based on their frequency in the corpus. The transformations are applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>This class can be accessed from atom through the tokenize method. Read more in the user guide.</p> <p>Parametersbigram_freq: int, float or None, default=None Frequency threshold for bigram creation. <ul> <li>If None: Don't create any bigrams.</li> <li>If int: Minimum number of occurrences to make a bigram.</li> <li>If float: Minimum frequency fraction to make a bigram.</li> </ul> <p>trigram_freq: int, float or None, default=None Frequency threshold for trigram creation. <ul> <li>If None: Don't create any trigrams.</li> <li>If int: Minimum number of occurrences to make a trigram.</li> <li>If float: Minimum frequency fraction to make a trigram.</li> </ul> <p>quadgram_freq: int, float or None, default=None Frequency threshold for quadgram creation. 
<ul> <li>If None: Don't create any quadgrams.</li> <li>If int: Minimum number of occurrences to make a quadgram.</li> <li>If float: Minimum frequency fraction to make a quadgram.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>Attributesbigrams_: pd.DataFrame Created bigrams and their frequencies. <p>trigrams_: pd.DataFrame Created trigrams and their frequencies. <p>quadgrams_: pd.DataFrame Created quadgrams and their frequencies. <p>feature_names_in_: np.ndarray Names of features seen during fit. <p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>TextCleaner Applies standard text cleaning to the corpus.</p> <p>TextNormalizer Normalize the corpus.</p> <p>Vectorizer Vectorize text data.</p> <p></p>"}, {"location": "API/nlp/tokenizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n&gt;&gt;&gt; y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       
1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       1\n\n\n&gt;&gt;&gt; atom.tokenize(verbose=2)\n\nFitting Tokenizer...\nTokenizing the corpus...\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n                                      corpus  target\n0                                [new, york]       0\n1                       [another, line, ...]       1\n2                      [New, york, is, nice]       0\n3  [new, york, is, larger, than, washington]       1\n4                       [running, the, test]       0\n5                [I, \u00e0m, in, ne, ', w, york]       1\n6                        [this, is, a, test]       0\n7          [hi, there, this, is, a, test, !]       1\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.nlp import Tokenizer\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... 
]\n\n&gt;&gt;&gt; tokenizer = Tokenizer(bigram_freq=2, verbose=2)\n&gt;&gt;&gt; X = tokenizer.transform(X)\n\nTokenizing the corpus...\n --&gt; Creating 5 bigrams on 10 locations.\n\n\n&gt;&gt;&gt; print(X)\n\n                                     corpus\n0               [I, \u00e0m, in, ne, ', w, york]\n1                      [New, york_is, nice]\n2                                [new_york]\n3           [hi, there, this_is, a_test, !]\n4                      [another, line, ...]\n5  [new, york_is, larger, than, washington]\n6                      [running, the, test]\n7                         [this_is, a_test]\n</code></pre>"}, {"location": "API/nlp/tokenizer/#methods", "title": "Methods", "text": "<p>fitDo nothing.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformTokenize the text.</p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Do nothing.</p> <p>Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. 
</p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Tokenize the text.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed corpus. </p> <p></p>"}, {"location": "API/nlp/vectorizer/", "title": "Vectorizer", "text": "<p>class atom.nlp.Vectorizer(strategy=\"bow\", return_sparse=True, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, verbose=0, logger=None, **kwargs)[source]Vectorize text data.</p> <p>Transform the corpus into meaningful vectors of numbers. The transformation is applied on the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>If strategy=\"bow\" or \"tfidf\", the transformed columns are named after the word they are embedding with the prefix <code>corpus_</code>. If strategy=\"hashing\", the columns are named hash[N], where N stands for the n-th hashed column.</p> <p>This class can be accessed from atom through the vectorize method. 
Read more in the user guide.</p> <p>Parametersstrategy: str, default=\"bow\" Strategy with which to vectorize the text. Choose from: <ul> <li>\"bow\": Bag of Words.</li> <li>\"tfidf\": Term Frequency - Inverse Document Frequency.</li> <li>\"hashing\": Vectorize to a matrix of token occurrences.</li> </ul> <p>return_sparse: bool, default=True Whether to return the transformation output as a dataframe of sparse arrays. Must be False when there are other columns in X (besides <code>corpus</code>) that are non-sparse. <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic naming.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>strategy</code> estimator. <p>Attributes[strategy]_: sklearn transformer Estimator instance (lowercase strategy) used to vectorize the corpus, e.g., <code>vectorizer.tfidf</code> for the tfidf strategy. <p>feature_names_in_: np.ndarray Names of features seen during fit. 
<p>n_features_in_: int Number of features seen during fit. <p></p> <p></p> <p>See Also</p> <p>TextCleaner Applies standard text cleaning to the corpus.</p> <p>TextNormalizer Normalize the corpus.</p> <p>Tokenizer Tokenize the corpus.</p> <p></p>"}, {"location": "API/nlp/vectorizer/#example", "title": "Example", "text": "atomstand-alone <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n&gt;&gt;&gt; y = [1, 0, 0, 1, 1, 1, 0, 0]\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, test_size=2, random_state=1)\n&gt;&gt;&gt; print(atom.dataset)\n\n                               corpus  target\n0                            new york       0\n1                     another line...       1\n2                    New york is nice       0\n3  new york is larger than washington       1\n4                    running the test       0\n5                   I \u00e0m in ne'w york       1\n6                      this is a test       0\n7            hi there this is a test!       
1\n\n\n&gt;&gt;&gt; atom.vectorize(strategy=\"tfidf\", verbose=2)\n\nFitting Vectorizer...\nVectorizing the corpus...\n\n\n&gt;&gt;&gt; print(atom.dataset)\n\n   corpus_another  corpus_in  corpus_is  corpus_larger  corpus_line  corpus_ne  corpus_new  corpus_nice  corpus_running  corpus_test  corpus_than  corpus_the  corpus_washington  corpus_york  corpus_\u00e0m  target\n0        0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.759339     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.650696   0.000000       0\n1        0.707107   0.000000   0.000000       0.000000     0.707107   0.000000    0.000000     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.000000   0.000000       1\n2        0.000000   0.000000   0.518242       0.000000     0.000000   0.000000    0.437535     0.631991         0.00000     0.000000     0.000000     0.00000           0.000000     0.374934   0.000000       0\n3        0.000000   0.000000   0.386401       0.471212     0.000000   0.000000    0.326226     0.000000         0.00000     0.000000     0.471212     0.00000           0.471212     0.279551   0.000000       1\n4        0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.000000     0.000000         0.57735     0.577350     0.000000     0.57735           0.000000     0.000000   0.000000       0\n5        0.000000   0.546199   0.000000       0.000000     0.000000   0.546199    0.000000     0.000000         0.00000     0.000000     0.000000     0.00000           0.000000     0.324037   0.546199       1\n6        0.000000   0.000000   0.634086       0.000000     0.000000   0.000000    0.000000     0.000000         0.00000     0.773262     0.000000     0.00000           0.000000     0.000000   0.000000       0\n7        0.000000   0.000000   0.634086       0.000000     0.000000   0.000000    0.000000     0.000000         0.00000     0.773262     0.000000     
0.00000           0.000000     0.000000   0.000000       1\n</code></pre> <pre><code>&gt;&gt;&gt; from atom.nlp import Vectorizer\n\n&gt;&gt;&gt; X = [\n...    [\"I \u00e0m in ne'w york\"],\n...    [\"New york is nice\"],\n...    [\"new york\"],\n...    [\"hi there this is a test!\"],\n...    [\"another line...\"],\n...    [\"new york is larger than washington\"],\n...    [\"running the test\"],\n...    [\"this is a test\"],\n... ]\n\n&gt;&gt;&gt; vectorizer = Vectorizer(strategy=\"tfidf\", verbose=2)\n&gt;&gt;&gt; X = vectorizer.fit_transform(X)\n\nFitting Vectorizer...\nVectorizing the corpus...\n\n\n&gt;&gt;&gt; print(X)\n\n   corpus_another  corpus_hi  corpus_in  corpus_is  corpus_larger  corpus_line  corpus_ne  corpus_new  corpus_nice  corpus_running  corpus_test  corpus_than  corpus_the  corpus_there  corpus_this  corpus_washington  corpus_york  corpus_\u00e0m\n0        0.000000   0.000000   0.542162   0.000000       0.000000     0.000000   0.542162    0.000000     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.343774   0.542162\n1        0.000000   0.000000   0.000000   0.415657       0.000000     0.000000   0.000000    0.474072     0.655527        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.415657   0.000000\n2        0.000000   0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.751913     0.000000        0.000000     0.000000     0.000000    0.000000      0.000000     0.000000           0.000000     0.659262   0.000000\n3        0.000000   0.525049   0.000000   0.332923       0.000000     0.000000   0.000000    0.000000     0.000000        0.000000     0.379712     0.000000    0.000000      0.525049     0.440032           0.000000     0.000000   0.000000\n4        0.707107   0.000000   0.000000   0.000000       0.000000     0.707107   0.000000    0.000000     0.000000        0.000000     0.000000     0.000000    
0.000000      0.000000     0.000000           0.000000     0.000000   0.000000\n5        0.000000   0.000000   0.000000   0.304821       0.480729     0.000000   0.000000    0.347660     0.000000        0.000000     0.000000     0.480729    0.000000      0.000000     0.000000           0.480729     0.304821   0.000000\n6        0.000000   0.000000   0.000000   0.000000       0.000000     0.000000   0.000000    0.000000     0.000000        0.629565     0.455297     0.000000    0.629565      0.000000     0.000000           0.000000     0.000000   0.000000\n7        0.000000   0.000000   0.000000   0.497041       0.000000     0.000000   0.000000    0.000000     0.000000        0.000000     0.566893     0.000000    0.000000      0.000000     0.656949           0.000000     0.000000   0.000000\n</code></pre>"}, {"location": "API/nlp/vectorizer/#methods", "title": "Methods", "text": "<p>fitFit to data.fit_transformFit to data, then transform it.get_paramsGet parameters for this estimator.inverse_transformDo nothing.set_paramsSet the parameters of this estimator.transformVectorize the text.</p> <p></p> <p>method fit(X, y=None)[source]Fit to data.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>ReturnsSelf Estimator instance. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit to data, then transform it.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. 
<ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Do nothing.</p> <p>Returns the input unchanged. Implemented for continuity of the API.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, sequence, dataframe-like or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput   tasks.</li> <li>If dataframe-like: Target columns with shape=(n_samples,   n_targets) for multioutput tasks.</li> </ul> <p>Returnsdataframe Feature set. Only returned if provided. <p>series or dataframe Target column. Only returned if provided. 
</p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method transform(X, y=None)[source]Vectorize the text.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). If X is not a dataframe, it should be composed of a single feature containing the text documents. <p>y: int, str, sequence, dataframe-like or None, default=None Do nothing. Implemented for continuity of the API. <p>Returnsdataframe Transformed corpus. </p> <p></p>"}, {"location": "API/pipeline/pipeline/", "title": "Pipeline", "text": "<p>class atom.pipeline.Pipeline(steps, memory=None, verbose=0)[source]Pipeline of transforms with a final estimator.</p> <p>Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be transformers, that is, they must implement <code>fit</code> and <code>transform</code> methods. The final estimator only needs to implement <code>fit</code>. The transformers in the pipeline can be cached using the <code>memory</code> parameter.</p> <p>The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by <code>__</code>, as in the example below. 
A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting it to <code>passthrough</code> or <code>None</code>.</p> <p>Read more in sklearn's user guide.</p> <p>Info</p> <p>This class behaves similarly to sklearn's pipeline, and additionally:</p> <ul> <li>Works with an empty pipeline.</li> <li>Accepts transformers that drop rows.</li> <li>Accepts transformers that are only fitted on a subset of the   provided dataset.</li> <li>Accepts transformers that apply only on the target column.</li> <li>Uses transformers that are only applied on the training set   to fit the pipeline, not to make predictions on new data.</li> <li>The instance is considered fitted at initialization if all   the underlying transformers/estimator in the pipeline are.</li> <li>It returns attributes from the final estimator if they are   not of the Pipeline.</li> <li>The last transformer is also cached.</li> </ul> <p>Warning</p> <p>This Pipeline only works with estimators whose parameters for fit, transform, predict, etc... are named <code>X</code> and/or <code>y</code>.</p> <p>Parameterssteps: list of tuple List of (name, transform) tuples (implementing <code>fit</code>/<code>transform</code>) that are chained in sequential order. <p>memory: str, Memory or None, default=None Used to cache the fitted transformers of the pipeline. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute <code>named_steps</code> or <code>steps</code> to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time-consuming. <p>verbose: int or None, default=0 Verbosity level of the transformers in the pipeline. If None, it leaves them to their original verbosity. If &gt;0, the time elapsed while fitting each step is printed. 
<p>Attributesnamed_steps: Bunch Dictionary-like object, with the following attributes. Read-only attribute to access any step parameter by user given name. Keys are step names and values are steps parameters. <p>classes_: np.ndarray of shape (n_classes,) The class' labels. Only exist if the last step of the pipeline is a classifier. <p>feature_names_in_: np.ndarray Names of features seen during first step <code>fit</code> method. <p>n_features_in_: int Number of features seen during first step <code>fit</code> method. <p></p> <p></p>"}, {"location": "API/pipeline/pipeline/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Initialize atom\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, verbose=2)\n\n&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 165 (1.2%)\n\n\n\n&gt;&gt;&gt; # Apply data cleaning and feature engineering methods\n&gt;&gt;&gt; atom.scale()\n\nFitting Scaler...\nScaling features...\n\n&gt;&gt;&gt; atom.balance(strategy=\"smote\")\n\nOversampling with SMOTE...\n --&gt; Adding 116 samples to class 0.\n\n&gt;&gt;&gt; atom.feature_selection(strategy=\"rfe\", solver=\"lr\", n_features=22)\n\nFitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfe selected 22 features from the dataset.\n   --&gt; Dropping feature mean texture (rank 2).\n   --&gt; Dropping feature mean smoothness (rank 3).\n   --&gt; Dropping feature mean symmetry (rank 9).\n   --&gt; Dropping feature texture error (rank 7).\n   --&gt; Dropping feature smoothness error (rank 4).\n   --&gt; 
Dropping feature concavity error (rank 5).\n   --&gt; Dropping feature worst compactness (rank 8).\n   --&gt; Dropping feature worst fractal dimension (rank 6).\n\n\n&gt;&gt;&gt; # Train models\n&gt;&gt;&gt; atom.run(models=\"LR\")\n\n\nTraining ========================= &gt;&gt;\nModels: LR\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9913\nTest evaluation --&gt; f1: 0.9787\nTime elapsed: 0.030s\n-------------------------------------------------\nTime: 0.030s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.033s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9787\n\n\n&gt;&gt;&gt; # Get the pipeline and make predictions\n&gt;&gt;&gt; pl = atom.lr.export_pipeline()\n&gt;&gt;&gt; print(pl.predict(X))\n\n[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1\n 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1\n 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1\n 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0\n 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1\n 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0\n 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1\n 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1\n 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0\n 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1\n 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1\n 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1\n 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 0 0\n 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n 1 1 1 1 1 1 1 0 0 0 
0 0 0 1]\n</code></pre>"}, {"location": "API/pipeline/pipeline/#methods", "title": "Methods", "text": "<p>decision_functionTransform, then decision_function of the final estimator.fitFit the pipeline.fit_predictTransform the data, and apply <code>fit_predict</code> with the final estimator.fit_transformFit the pipeline and transform the data.get_feature_names_outGet output feature names for transformation.get_paramsGet parameters for this estimator.inverse_transformInverse transform for each step in a reverse order.predictTransform, then predict of the final estimator.predict_log_probaTransform, then predict_log_proba of the final estimator.predict_probaTransform, then predict_proba of the final estimator.scoreTransform, then score of the final estimator.score_samplesTransform the data, and apply <code>score_samples</code> with the final estimator.set_outputSet the output container when <code>\"transform\"</code> and <code>\"fit_transform\"</code> are called.set_paramsSet the parameters of this estimator.transformTransform the data.</p> <p></p> <p>method decision_function(X)[source]Transform, then decision_function of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>Returnsnp.ndarray Predicted confidence scores. </p> <p></p> <p>method fit(X=None, y=None, **fit_params)[source]Fit the pipeline.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. <p>y: int, str, dict, sequence or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>Else: Array with shape=(n_samples,) to use as target.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsself Estimator instance. 
</p> <p></p> <p>method fit_predict(X, y=None, **fit_params)[source]Transform the data, and apply <code>fit_predict</code> with the final estimator.</p> <p>ParametersX : iterable Training data. Must fulfill input requirements of first step of the pipeline. <p>y : iterable, default=None Training targets. Must fulfill label requirements for all steps of the pipeline. <p>**fit_params : dict of string -&gt; object Parameters passed to the <code>fit</code> method of each step, where each parameter name is prefixed such that parameter <code>p</code> for step <code>s</code> has key <code>s__p</code>. <p>Returnsy_pred : ndarray Result of calling <code>fit_predict</code> on the final estimator. </p> <p></p> <p>method fit_transform(X=None, y=None, **fit_params)[source]Fit the pipeline and transform the data.</p> <p>Call <code>fit</code> followed by <code>transform</code> on each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls the <code>transform</code> method. Only valid if the final estimator implements <code>transform</code>. This also works when the final estimator is <code>None</code>, in which case all prior transformations are applied.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the estimator only uses y. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>**fit_params Additional keyword arguments for the fit method. <p>Returnsdataframe Transformed feature set. 
Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method get_feature_names_out(input_features=None)[source]Get output feature names for transformation.</p> <p>Parametersinput_features : array-like of str or None, default=None Input features. <p>Returnsfeature_names_out : ndarray of str objects Transformed feature names. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : mapping of string to any Parameter names mapped to their values. </p> <p></p> <p>method inverse_transform(X=None, y=None)[source]Inverse transform for each step in a reverse order.</p> <p>All estimators in the pipeline must implement the <code>inverse_transform</code> method.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y. <p>y: int, str, dict, sequence, dataframe or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>If dict: Name of the target column and sequence of values.</li> <li>If sequence: Target column with shape=(n_samples,) or   sequence of column names or positions for multioutput tasks.</li> <li>If dataframe: Target columns for multioutput tasks.</li> </ul> <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. </p> <p></p> <p>method predict(X, **predict_params)[source]Transform, then predict of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>**predict_params Additional keyword arguments for the predict method. 
Note that while this may be used to return uncertainties from some models with return_std or return_cov, uncertainties that are generated by the transformations in the pipeline are not propagated to the final estimator. <p>Returnsnp.ndarray Predicted classes with shape=(n_samples,). </p> <p></p> <p>method predict_log_proba(X)[source]Transform, then predict_log_proba of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>Returnsnp.ndarray Predicted class log-probabilities. </p> <p></p> <p>method predict_proba(X)[source]Transform, then predict_proba of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>Returnsnp.ndarray Predicted class probabilities. </p> <p></p> <p>method score(X, y, sample_weight=None)[source]Transform, then score of the final estimator.</p> <p>ParametersX: dataframe-like Feature set with shape=(n_samples, n_features). <p>y: int, str, dict, sequence <ul> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>Else: Array with shape=(n_samples,) to use as target.</li> </ul> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y. <p>Returnsfloat Mean accuracy or r2 of self.predict(X) with respect to y. </p> <p></p> <p>method score_samples(X)[source]Transform the data, and apply <code>score_samples</code> with the final estimator.</p> <p>ParametersX : iterable Data to predict on. Must fulfill input requirements of first step of the pipeline. <p>Returnsy_score : ndarray of shape (n_samples,) Result of calling <code>score_samples</code> on the final estimator. </p> <p></p> <p>method set_output(transform=None)[source]Set the output container when <code>\"transform\"</code> and <code>\"fit_transform\"</code> are called.</p> <p>Parameterstransform : {\"default\", \"pandas\"}, default=None Configure output of <code>transform</code> and <code>fit_transform</code>. 
<ul> <li><code>\"default\"</code>: Default output format of a transformer</li> <li><code>\"pandas\"</code>: DataFrame output</li> <li><code>None</code>: Transform configuration is unchanged</li> </ul> <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method set_params(**kwargs)[source]Set the parameters of this estimator.</p> <p>Parameters**kwargs : dict Parameters of this estimator or parameters of estimators contained in <code>steps</code>. Parameters of the steps may be set using its name and the parameter name separated by a '__'. <p>Returnsself : object Pipeline class instance. </p> <p></p> <p>method transform(X=None, y=None, **kwargs)[source]Transform the data.</p> <p>Call <code>transform</code> on each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls the <code>transform</code> method. Only valid if the final estimator implements <code>transform</code>. This also works when the final estimator is <code>None</code>, in which case all prior transformations are applied.</p> <p>ParametersX: dataframe-like or None, default=None Feature set with shape=(n_samples, n_features). If None, X is ignored. None if the pipeline only uses y. <p>y: int, str, dict, sequence or None, default=None Target column corresponding to `X`. <ul> <li>If None: y is ignored.</li> <li>If int: Position of the target column in X.</li> <li>If str: Name of the target column in X.</li> <li>Else: Array with shape=(n_samples,) to use as target.</li> </ul> <p>**kwargs Additional keyword arguments for the <code>_iter</code> inner method. <p>Returnsdataframe Transformed feature set. Only returned if provided. <p>series or dataframe Transformed target column. Only returned if provided. 
</p> <p></p>"}, {"location": "API/plots/plot_calibration/", "title": "plot_calibration", "text": "<p>method plot_calibration(models=None, rows=\"test\", n_bins=10, target=0, title=None, legend=\"upper left\", figsize=(900, 900), filename=None, display=True)[source]Plot the calibration curve for a binary classifier.</p> <p>Well-calibrated classifiers are probabilistic classifiers for which the output of the <code>predict_proba</code> method can be directly interpreted as a confidence level. For instance, a calibrated (binary) classifier should classify the samples such that among the samples to which it gave a <code>predict_proba</code> value close to 0.8, approx. 80% actually belong to the positive class. Read more in sklearn's documentation.</p> <p>This figure shows two plots: the calibration curve, where the x-axis represents the average predicted probability in each bin and the y-axis is the fraction of positives, i.e., the proportion of samples whose class is the positive class (in each bin); and a distribution of all predicted probabilities of the classifier. This plot is available only for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Tip</p> <p>Use the calibrate method to calibrate the winning model.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>n_bins: int, default=10 Number of bins used for calibration. Minimum of 5 required. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 900) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_lift Plot the lift curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p></p>"}, {"location": "API/plots/plot_calibration/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"RF\", \"LGB\"])\n&gt;&gt;&gt; atom.plot_calibration()\n</code></pre>"}, {"location": "API/plots/plot_components/", "title": "plot_components", "text": "<p>method plot_components(show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the explained variance ratio per component.</p> <p>Kept components are colored and discarded components are transparent. 
This plot is available only when feature selection was applied with strategy=\"pca\".</p> <p>Parametersshow: int or None, default=None Number of components to show. None to show all. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of components shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_pca Plot the explained variance ratio vs number of components.</p> <p>plot_rfecv Plot the rfecv results.</p> <p></p>"}, {"location": "API/plots/plot_components/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.feature_selection(\"pca\", n_features=5)\n&gt;&gt;&gt; atom.plot_components(show=10)\n</code></pre>"}, {"location": "API/plots/plot_confusion_matrix/", "title": "plot_confusion_matrix", "text": "<p>method plot_confusion_matrix(models=None, rows=\"test\", target=0, threshold=0.5, title=None, legend=\"upper right\", figsize=None, filename=None, display=True)[source]Plot a model's confusion matrix.</p> <p>For one model, the plot shows a heatmap. For multiple models, it compares TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). This plot is available only for classification tasks.</p> <p>Tip</p> <p>Fill the <code>threshold</code> parameter with the result from the model's <code>get_best_threshold</code> method to optimize the results.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: hashable, segment or sequence, default=\"test\" Selection of rows on which to calculate the confusion matrix. <p>target: int or str, default=0 Target column to look at. Only for multioutput tasks. <p>threshold: float, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only for binary classification tasks. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the plot's type. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_calibration Plot the calibration curve for a binary classifier.</p> <p>plot_threshold Plot metric performances against threshold values.</p> <p></p>"}, {"location": "API/plots/plot_confusion_matrix/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, test_size=0.4)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.lr.plot_confusion_matrix()  # For one model\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_confusion_matrix()  # For multiple models\n</code></pre>"}, {"location": "API/plots/plot_correlation/", "title": "plot_correlation", "text": "<p>method plot_correlation(columns=None, method=\"pearson\", title=None, legend=None, figsize=(800, 700), filename=None, display=True)[source]Plot a correlation matrix.</p> <p>Displays a heatmap showing the correlation between columns in the dataset. The colors red, blue and white stand for positive, negative, and no correlation respectively.</p> <p>Parameterscolumns: segment, sequence, dataframe or None, default=None Columns to plot. If None, plot all columns in the dataset. Selected categorical columns are ignored. <p>method: str, default=\"pearson\" Method of correlation. Choose from: pearson, kendall or spearman. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple, default=(800, 700) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. 
The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_distribution Plot column distributions.</p> <p>plot_qq Plot a quantile-quantile plot.</p> <p>plot_relationships Plot pairwise relationships in a dataset.</p> <p></p>"}, {"location": "API/plots/plot_correlation/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.plot_correlation()\n</code></pre>"}, {"location": "API/plots/plot_det/", "title": "plot_det", "text": "<p>method plot_det(models=None, rows=\"test\", target=0, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the Detection Error Tradeoff curve.</p> <p>Read more about DET in sklearn's documentation. Only available for binary classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_gains Plot the cumulative gains curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p></p>"}, {"location": "API/plots/plot_det/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_det()\n</code></pre>"}, {"location": "API/plots/plot_distribution/", "title": "plot_distribution", "text": "<p>method plot_distribution(columns=0, distributions=\"kde\", show=None, title=None, legend=\"upper right\", figsize=None, filename=None, display=True)[source]Plot column distributions.</p> <ul> <li>For numerical columns, plot the probability density   distribution. 
Additionally, it's possible to plot any of   <code>scipy.stats</code> distributions fitted to the column.</li> <li>For categorical columns, plot the class distribution.   Only one categorical column can be plotted at the same time.</li> </ul> <p>Tip</p> <p>Use atom's distribution method to check which distribution fits the column best.</p> <p>Parameterscolumns: int, str, slice or sequence, default=0 Columns to plot. It's only possible to plot one categorical column. If more than one categorical column is selected, all categorical columns are ignored. <p>distributions: str, sequence or None, default=\"kde\" Distributions to fit. Only for numerical columns. <ul> <li>If None: No distribution is fit.</li> <li>If \"kde\": Fit a Gaussian kde distribution.</li> <li>Else: Name of a <code>scipy.stats</code> distribution.</li> </ul> <p>show: int or None, default=None Number of classes (ordered by number of occurrences) to show in the plot. If None, it shows all classes. Only for categorical columns. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None: No title is shown.</li> <li>If str: Text for the title.</li> <li>If dict: title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the plot's type. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. 
<p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_correlation Plot a correlation matrix.</p> <p>plot_qq Plot a quantile-quantile plot.</p> <p>plot_relationships Plot pairwise relationships in a dataset.</p> <p></p>"}, {"location": "API/plots/plot_distribution/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; # Add a categorical feature\n&gt;&gt;&gt; animals = [\"cat\", \"dog\", \"bird\", \"lion\", \"zebra\"]\n&gt;&gt;&gt; probabilities = [0.001, 0.1, 0.2, 0.3, 0.399]\n&gt;&gt;&gt; X[\"animals\"] = np.random.choice(animals, size=len(X), p=probabilities)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.plot_distribution(columns=[0, 1])\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_distribution(columns=0, distributions=[\"norm\", \"invgauss\"])\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_distribution(columns=\"animals\")\n</code></pre>"}, {"location": "API/plots/plot_edf/", "title": "plot_edf", "text": "<p>method plot_edf(models=None, metric=None, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot the Empirical Distribution Function of a study.</p> <p>Use this plot to analyze and improve hyperparameter search spaces. The EDF assumes that the value of the objective function is in accordance with the uniform distribution over the objective space. This plot is only available for models that ran hyperparameter tuning.</p> <p>Note</p> <p>Only complete trials are considered when plotting the EDF.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. 
<p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). If str, add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_edf/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from optuna.distributions import IntDistribution\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n\n&gt;&gt;&gt; # Run three models with different search spaces\n&gt;&gt;&gt; atom.run(\n...     models=\"RF_1\",\n...     n_trials=20,\n...     
ht_params={\"distributions\": {\"n_estimators\": IntDistribution(6, 10)}},\n... )\n&gt;&gt;&gt; atom.run(\n...     models=\"RF_2\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(11, 15)}},\n... )\n&gt;&gt;&gt; atom.run(\n...     models=\"RF_3\",\n...     n_trials=20,\n...     ht_params={\"distributions\": {\"n_estimators\": IntDistribution(16, 20)}},\n... )\n\n&gt;&gt;&gt; atom.plot_edf()\n</code></pre>"}, {"location": "API/plots/plot_errors/", "title": "plot_errors", "text": "<p>method plot_errors(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot a model's prediction errors.</p> <p>Plot the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, dashed line shows the identity line. This plot can be useful to detect noise or heteroscedasticity along a range of the target domain. This plot is available only for regression tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multioutput tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_residuals Plot a model's residuals.</p> <p></p>"}, {"location": "API/plots/plot_errors/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y)\n&gt;&gt;&gt; atom.run([\"OLS\", \"LGB\"])\n&gt;&gt;&gt; atom.plot_errors()\n</code></pre>"}, {"location": "API/plots/plot_evals/", "title": "plot_evals", "text": "<p>method plot_evals(models=None, dataset=\"test\", title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot evaluation curves.</p> <p>The evaluation curves are the main metric scores achieved by the models at every iteration of the training process. This plot is available only for models that allow in-training validation.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>dataset: str, default=\"test\" Data set for which to plot the evaluation curves. Use <code>+</code> between options to select more than one. Choose from: \"train\", \"test\". <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_evals/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"XGB\", \"LGB\"])\n&gt;&gt;&gt; atom.plot_evals()\n</code></pre>"}, {"location": "API/plots/plot_feature_importance/", "title": "plot_feature_importance", "text": "<p>method plot_feature_importance(models=None, show=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot a model's feature importance.</p> <p>The sum of importances for all features (per model) is 1. 
This plot is available only for models whose estimator has a <code>scores_</code>, <code>feature_importances_</code> or <code>coef_</code> attribute.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p>plot_partial_dependence Plot the partial dependence of features.</p> <p>plot_permutation_importance Plot the feature permutation importance of models.</p> <p></p>"}, {"location": "API/plots/plot_feature_importance/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_feature_importance(show=10)\n</code></pre>"}, {"location": "API/plots/plot_forecast/", "title": "plot_forecast", "text": "<p>method plot_forecast(models=None, fh=\"test\", X=None, target=0, plot_interval=True, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot a time series with model forecasts.</p> <p>This plot is only available for forecasting tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. If no models are selected, only the target column is plotted. <p>fh: hashable, segment, sequence or ForecastingHorizon, default=\"test\" Forecast horizon for which to plot the predictions. <p>X: dataframe-like or None, default=None Exogenous time series corresponding to fh. This parameter is ignored if fh is a data set. <p>target: int or str, default=0 Target column to look at. Only for multivariate tasks. <p>plot_interval: bool, default=True Whether to plot prediction intervals instead of the exact prediction values. If True, the plotted estimators should have a <code>predict_interval</code> method. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_lift Plot the lift curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p></p>"}, {"location": "API/plots/plot_forecast/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; atom = ATOMForecaster(y, random_state=1)\n&gt;&gt;&gt; atom.plot_forecast()\n</code></pre> <pre><code>&gt;&gt;&gt; atom.run(\n...     models=\"arima\",\n...     est_params={\"order\": (1, 1, 0), \"seasonal_order\": (0, 1, 0, 12)},\n... 
)\n&gt;&gt;&gt; atom.plot_forecast()\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_forecast(fh=\"train+test\", plot_interval=False)\n</code></pre> <pre><code>&gt;&gt;&gt; # Forecast the next 4 years starting from the test set\n&gt;&gt;&gt; atom.plot_forecast(fh=range(1, 48))\n</code></pre>"}, {"location": "API/plots/plot_gains/", "title": "plot_gains", "text": "<p>method plot_gains(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the cumulative gains curve.</p> <p>This plot is available only for binary and multilabel classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. 
<p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_det Plot the Detection Error Tradeoff curve.</p> <p>plot_lift Plot the lift curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p></p>"}, {"location": "API/plots/plot_gains/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_gains()\n</code></pre>"}, {"location": "API/plots/plot_hyperparameter_importance/", "title": "plot_hyperparameter_importance", "text": "<p>method plot_hyperparameter_importance(models=None, metric=0, show=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot a model's hyperparameter importance.</p> <p>The hyperparameter importances are calculated using the fANOVA importance evaluator. The sum of all importances for all parameters (per model) is 1. This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. <p>metric: int or str, default=0 Metric to plot (only for multi-metric runs). <p>show: int or None, default=None Number of hyperparameters (ordered by importance) to show. None to show all. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Legend for the plot. 
See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_feature_importance Plot a model's feature importance.</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_hyperparameter_importance/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"ET\", \"RF\"], n_trials=10)\n&gt;&gt;&gt; atom.plot_hyperparameter_importance()\n</code></pre>"}, {"location": "API/plots/plot_hyperparameters/", "title": "plot_hyperparameters", "text": "<p>method plot_hyperparameters(models=None, params=(0, 1), metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot hyperparameter relationships in a study.</p> <p>A model's hyperparameters are plotted against each other. The corresponding metric scores are displayed in a contour plot. The markers are the trials in the study. 
This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_hyperparameters()</code>. <p>params: str, segment or sequence, default=(0, 1) Hyperparameters to plot. Use a sequence or add <code>+</code> between options to select more than one. <p>metric: int or str, default=0 Metric to plot (only for multi-metric runs). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_hyperparameter_importance Plot a model's hyperparameter importance.</p> <p>plot_parallel_coordinate Plot high-dimensional parameter relationships in a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_hyperparameters/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\", n_trials=15)\n&gt;&gt;&gt; atom.plot_hyperparameters(params=(0, 1, 2))\n</code></pre>"}, {"location": "API/plots/plot_learning_curve/", "title": "plot_learning_curve", "text": "<p>method plot_learning_curve(models=None, metric=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the learning curve: score vs number of training samples.</p> <p>This plot is available only for models fitted using train sizing. Ensembles are ignored.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Use a sequence or add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_results Plot the model results.</p> <p>plot_successive_halving Plot scores per iteration of the successive halving.</p> <p></p>"}, {"location": "API/plots/plot_learning_curve/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.train_sizing([\"LR\", \"RF\"], n_bootstrap=5)\n&gt;&gt;&gt; atom.plot_learning_curve()\n</code></pre>"}, {"location": "API/plots/plot_lift/", "title": "plot_lift", "text": "<p>method plot_lift(models=None, rows=\"test\", target=0, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the lift curve.</p> <p>Only available for binary classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. 
<ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_det Plot the Detection Error Tradeoff curve.</p> <p>plot_gains Plot the cumulative gains curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p></p>"}, {"location": "API/plots/plot_lift/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_lift()\n</code></pre>"}, {"location": "API/plots/plot_ngrams/", "title": "plot_ngrams", "text": "<p>method plot_ngrams(ngram=\"bigram\", rows=\"dataset\", show=10, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot n-gram frequencies.</p> <p>The text for the plot is extracted from the column named <code>corpus</code>. If there is no column with that name, an exception is raised. If the documents are not tokenized, the words are separated by spaces.</p> <p>Tip</p> <p>Use atom's tokenize method to separate the words creating n-grams based on their frequency in the corpus.</p> <p>Parametersngram: str or int, default=\"bigram\" Number of contiguous words to search for (size of n-gram). Choose from: word (1), bigram (2), trigram (3), quadgram (4). <p>rows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows in the corpus to include in the search. <p>show: int or None, default=10 Number of n-grams (ordered by number of occurrences) to show in the plot. If None, show all n-grams (up to 200). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of n-grams shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_wordcloud Plot a wordcloud from the corpus.</p> <p></p>"}, {"location": "API/plots/plot_ngrams/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import fetch_20newsgroups\n\n&gt;&gt;&gt; X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n&gt;&gt;&gt; X = np.array(X).reshape(-1, 1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.textclean()\n&gt;&gt;&gt; atom.textnormalize()\n&gt;&gt;&gt; atom.plot_ngrams()\n</code></pre>"}, {"location": "API/plots/plot_parallel_coordinate/", "title": "plot_parallel_coordinate", "text": "<p>method plot_parallel_coordinate(models=None, params=None, metric=0, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot high-dimensional parameter relationships in a study.</p> <p>Every line of the plot represents one trial. 
This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_parallel_coordinate()</code>. <p>params: str, segment, sequence or None, default=None Hyperparameters to plot. Use a sequence or add <code>+</code> between options to select more than one. If None, all the model's hyperparameters are selected. <p>metric: int or str, default=0 Metric to plot (only for multi-metric runs). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_edf Plot the Empirical Distribution Function of a study.</p> <p>plot_hyperparameter_importance Plot a model's hyperparameter importance.</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p></p>"}, {"location": "API/plots/plot_parallel_coordinate/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"RF\", n_trials=15)\n&gt;&gt;&gt; atom.plot_parallel_coordinate(params=slice(1, 5))\n</code></pre>"}, {"location": "API/plots/plot_pareto_front/", "title": "plot_pareto_front", "text": "<p>method plot_pareto_front(models=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot the Pareto front of a study.</p> <p>Shows the trial scores plotted against each other. The marker's colors indicate the trial number. This plot is only available for models with multi-metric runs and hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_pareto_front()</code>. <p>metric: str, sequence or None, default=None Metrics to plot.  Use a sequence or add <code>+</code> between options to select more than one. If None, the metrics used to run the pipeline are selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. 
<p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of metrics shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_edf Plot the Empirical Distribution Function of a study.</p> <p>plot_slice Plot the parameter relationship in a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_pareto_front/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\n...     models=\"RF\",\n...     metric=[\"f1\", \"accuracy\", \"recall\"],\n...     n_trials=15,\n...  )\n&gt;&gt;&gt; atom.plot_pareto_front()\n</code></pre>"}, {"location": "API/plots/plot_parshap/", "title": "plot_parshap", "text": "<p>method plot_parshap(models=None, columns=None, target=1, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot the partial correlation of shap values.</p> <p>Plots the train and test correlation between the shap value of every feature with its target value, after removing the effect of all other features (partial correlation). This plot is useful to identify the features that are contributing most to overfitting. 
Features that lie below the bisector (diagonal line) performed worse on the test set than on the training set. If the estimator has a <code>scores_</code>, <code>feature_importances_</code> or <code>coef_</code> attribute, its normalized values are shown in a color map.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>columns: int, str, segment, sequence, dataframe or None, default=None Features to plot. If None, it plots all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_feature_importance Plot a model's feature importance.</p> <p>plot_partial_dependence Plot the partial dependence of features.</p> <p>plot_permutation_importance Plot the feature permutation importance of models.</p> <p></p>"}, {"location": "API/plots/plot_parshap/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"GNB\", \"RF\"])\n&gt;&gt;&gt; atom.rf.plot_parshap(legend=None)\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_parshap(columns=slice(5, 10))\n</code></pre>"}, {"location": "API/plots/plot_partial_dependence/", "title": "plot_partial_dependence", "text": "<p>method plot_partial_dependence(models=None, columns=(0, 1, 2), kind=\"average\", pair=None, target=1, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the partial dependence of features.</p> <p>The partial dependence of a feature (or a set of features) corresponds to the response of the model for each possible value of the feature. The plot can take two forms:</p> <ul> <li>If <code>pair</code> is None: Single feature partial dependence lines.   The deciles of the feature values are shown with tick marks   on the bottom.</li> <li>If <code>pair</code> is defined: Two-way partial dependence plots are   plotted as contour plots (only allowed for a single model).</li> </ul> <p>Read more about partial dependence on sklearn's documentation. This plot is not available for multilabel nor multiclass-multioutput classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. 
<p>columns: int, str, segment, sequence, dataframe, default=(0, 1, 2) Features to get the partial dependence from. <p>kind: str or sequence, default=\"average\" Kind of dependence to plot. Use a sequence or add <code>+</code> between options to select more than one. Choose from: <ul> <li>\"average\": Partial dependence averaged across all samples   in the dataset.</li> <li>\"individual\": Partial dependence for up to 50 random   samples (Individual Conditional Expectation).</li> </ul> <p>This parameter is ignored when plotting feature pairs.</p> <p>pair: int, str or None, default=None Feature with which to pair the features selected by <code>columns</code>. If specified, the resulting figure displays contour plots. Only allowed when plotting a single model. If None, the plots show the partial dependence of single features. <p>target: int or str, default=1 Class in the target column to look at (only for multiclass classification tasks). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_feature_importance Plot a model's feature importance.</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p>plot_permutation_importance Plot the feature permutation importance of models.</p> <p></p>"}, {"location": "API/plots/plot_partial_dependence/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_partial_dependence(kind=\"average+individual\", legend=\"upper left\")\n</code></pre> <pre><code>&gt;&gt;&gt; atom.rf.plot_partial_dependence(columns=(3, 4), pair=2)\n</code></pre>"}, {"location": "API/plots/plot_pca/", "title": "plot_pca", "text": "<p>method plot_pca(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the explained variance ratio vs number of components.</p> <p>If the underlying estimator is PCA (for dense datasets), all possible components are plotted. If the underlying estimator is TruncatedSVD (for sparse datasets), it only shows the selected components. The star marks the number of components selected by the user. This plot is available only when feature selection was applied with strategy=\"pca\".</p> <p>Parameterstitle: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). 
If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_components Plot the explained variance ratio per component.</p> <p>plot_rfecv Plot the rfecv results.</p> <p></p>"}, {"location": "API/plots/plot_pca/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.feature_selection(\"pca\", n_features=5)\n&gt;&gt;&gt; atom.plot_pca()\n</code></pre>"}, {"location": "API/plots/plot_permutation_importance/", "title": "plot_permutation_importance", "text": "<p>method plot_permutation_importance(models=None, show=None, n_repeats=10, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the feature permutation importance of models.</p> <p>Warning</p> <p>This method can be slow. Results are cached to speed up repeated calls.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>n_repeats: int, default=10 Number of times to permute each feature. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_feature_importance Plot a model's feature importance.</p> <p>plot_partial_dependence Plot the partial dependence of features.</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p></p>"}, {"location": "API/plots/plot_permutation_importance/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_permutation_importance(show=10, n_repeats=7)\n</code></pre>"}, {"location": "API/plots/plot_pipeline/", "title": "plot_pipeline", "text": "<p>method plot_pipeline(models=None, draw_hyperparameter_tuning=True, color_branches=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot a diagram of the pipeline.</p> <p>Warning</p> <p>This plot uses the schemdraw package, which is incompatible with plotly. 
The returned plot is therefore a matplotlib figure.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models for which to draw the pipeline. If None, all pipelines are plotted. <p>draw_hyperparameter_tuning: bool, default=True Whether to show in the diagram which models used hyperparameter tuning. <p>color_branches: bool or None, default=None Whether to draw every branch in a different color. If None, branches are colored when there is more than one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the pipeline drawn. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_wordcloud Plot a wordcloud from the corpus.</p> <p></p>"}, {"location": "API/plots/plot_pipeline/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"GNB\", \"RNN\", \"SGD\", \"MLP\"])\n&gt;&gt;&gt; atom.voting(models=atom.winners[:2])\n&gt;&gt;&gt; atom.plot_pipeline()\n</code></pre><pre><code>&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.scale()\n&gt;&gt;&gt; atom.prune()\n&gt;&gt;&gt; atom.run(\"RF\", n_trials=30)\n\n&gt;&gt;&gt; atom.branch = \"undersample\"\n&gt;&gt;&gt; atom.balance(\"nearmiss\")\n&gt;&gt;&gt; atom.run(\"RF_undersample\")\n\n&gt;&gt;&gt; atom.branch = \"oversample_from_main\"\n&gt;&gt;&gt; atom.balance(\"smote\")\n&gt;&gt;&gt; atom.run(\"RF_oversample\")\n\n&gt;&gt;&gt; atom.plot_pipeline()\n</code></pre>"}, {"location": "API/plots/plot_prc/", "title": "plot_prc", "text": "<p>method plot_prc(models=None, rows=\"test\", target=0, title=None, legend=\"lower left\", figsize=(900, 600), filename=None, display=True)[source]Plot the precision-recall curve.</p> <p>Read more about PRC in sklearn's documentation. Only available for binary classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. 
<p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_det Plot the Detection Error Tradeoff curve.</p> <p>plot_lift Plot the lift curve.</p> <p>plot_roc Plot the Receiver Operating Characteristics curve.</p> <p></p>"}, {"location": "API/plots/plot_prc/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_prc()\n</code></pre>"}, {"location": "API/plots/plot_probabilities/", "title": "plot_probabilities", "text": "<p>method plot_probabilities(models=None, rows=\"test\", target=1, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the probability distribution of the target classes.</p> <p>This plot is available only for models with a <code>predict_proba</code> method in classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: hashable, segment or sequence, default=\"test\" Selection of rows on which to calculate the metric. <p>target: int, str or tuple, default=1 Probability of being that class in the target column. For multioutput tasks, the value should be a tuple of the form (column, class). <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. 
<ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_confusion_matrix Plot a model's confusion matrix.</p> <p>plot_results Plot the model results.</p> <p>plot_threshold Plot metric performances against threshold values.</p> <p></p>"}, {"location": "API/plots/plot_probabilities/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_probabilities()\n</code></pre>"}, {"location": "API/plots/plot_qq/", "title": "plot_qq", "text": "<p>method plot_qq(columns=0, distributions=\"norm\", title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot a quantile-quantile plot.</p> <p>Columns are distinguished by color and the distributions are distinguished by marker type. Missing values are ignored.</p> <p>Parameterscolumns: int, str, slice or sequence, default=0 Columns to plot. Selected categorical columns are ignored. <p>distributions: str or sequence, default=\"norm\" Names of the <code>scipy.stats</code> distributions to fit to the columns. 
<p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_correlation Plot a correlation matrix.</p> <p>plot_distribution Plot column distributions.</p> <p>plot_relationships Plot pairwise relationships in a dataset.</p> <p></p>"}, {"location": "API/plots/plot_qq/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.plot_qq(columns=[5, 6])\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_qq(columns=0, distributions=[\"norm\", \"invgauss\", \"triang\"])\n</code></pre>"}, {"location": "API/plots/plot_relationships/", "title": "plot_relationships", "text": "<p>method plot_relationships(columns=(0, 1, 2), title=None, legend=None, figsize=(900, 900), filename=None, display=True)[source]Plot pairwise relationships in a dataset.</p> <p>Creates a grid of axes such that each numerical column appears once on the x-axes and once on the y-axes. The bottom triangle contains scatter plots (max 250 random samples), the diagonal plots contain column distributions, and the upper triangle contains contour histograms for all samples in the columns.</p> <p>Parameterscolumns: segment, sequence or dataframe, default=(0, 1, 2) Columns to plot. Selected categorical columns are ignored. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple, default=(900, 900) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). 
If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_correlation Plot a correlation matrix.</p> <p>plot_distribution Plot column distributions.</p> <p>plot_qq Plot a quantile-quantile plot.</p> <p></p>"}, {"location": "API/plots/plot_relationships/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.plot_relationships(columns=[0, 4, 5])\n</code></pre>"}, {"location": "API/plots/plot_residuals/", "title": "plot_residuals", "text": "<p>method plot_residuals(models=None, rows=\"test\", target=0, title=None, legend=\"upper left\", figsize=(900, 600), filename=None, display=True)[source]Plot a model's residuals.</p> <p>The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, dashed line shows the identity line. This plot can be useful to analyze the variance of the regressor's errors. If the points are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate. This plot is only available for regression tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. 
<ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multioutput tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_errors Plot a model's prediction errors.</p> <p></p>"}, {"location": "API/plots/plot_residuals/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y)\n&gt;&gt;&gt; atom.run([\"OLS\", \"LGB\"])\n&gt;&gt;&gt; atom.plot_residuals()\n</code></pre>"}, {"location": "API/plots/plot_results/", "title": "plot_results", "text": "<p>method plot_results(models=None, metric=None, title=None, legend=\"lower right\", figsize=None, filename=None, display=True)[source]Plot the model results.</p> <p>If all models applied bootstrap, the plot is a boxplot. If not, the plot is a barplot. Models are ordered based on their score from the top down. The score is either the <code>[metric]_bootstrap</code> or <code>[metric]_test</code> values, selected in that order.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Other available options are: \"time_bo\", \"time_fit\", \"time_bootstrap\", \"time\". If str, add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of models. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_confusion_matrix Plot a model's confusion matrix.</p> <p>plot_probabilities Plot the probability distribution of the target classes.</p> <p>plot_threshold Plot metric performances against threshold values.</p> <p></p>"}, {"location": "API/plots/plot_results/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"GNB\", \"LR\", \"RF\", \"LGB\"], metric=[\"f1\", \"recall\"])\n&gt;&gt;&gt; atom.plot_results()\n</code></pre> <pre><code>&gt;&gt;&gt; atom.run([\"GNB\", \"LR\", \"RF\", \"LGB\"], metric=[\"f1\", \"recall\"], n_bootstrap=5)\n&gt;&gt;&gt; atom.plot_results()\n</code></pre> <pre><code>&gt;&gt;&gt; atom.plot_results(metric=\"time_fit+time\")\n</code></pre>"}, {"location": "API/plots/plot_rfecv/", "title": "plot_rfecv", "text": "<p>method plot_rfecv(title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot the rfecv results.</p> <p>Plot the scores obtained by the estimator fitted on every subset of the dataset. Only available when feature selection was applied with strategy=\"rfecv\".</p> <p>Parameterstitle: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_components Plot the explained variance ratio per component.</p> <p>plot_pca Plot the explained variance ratio vs number of components.</p> <p></p>"}, {"location": "API/plots/plot_rfecv/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.feature_selection(\"rfecv\", solver=\"Tree\")\n&gt;&gt;&gt; atom.plot_rfecv()\n</code></pre>"}, {"location": "API/plots/plot_roc/", "title": "plot_roc", "text": "<p>method plot_roc(models=None, rows=\"test\", target=0, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the Receiver Operating Characteristics curve.</p> <p>Read more about ROC in sklearn's documentation. 
Only available for classification tasks.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>rows: str, sequence or dict, default=\"test\" Selection of rows on which to calculate the metric. <ul> <li>If str: Name of the data set to plot.</li> <li>If sequence: Names of the data sets to plot.</li> <li>If dict: Names of the sets with corresponding   selection of rows as values.</li> </ul> <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_gains Plot the cumulative gains curve.</p> <p>plot_lift Plot the lift curve.</p> <p>plot_prc Plot the precision-recall curve.</p> <p></p>"}, {"location": "API/plots/plot_roc/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_roc()\n</code></pre>"}, {"location": "API/plots/plot_shap_bar/", "title": "plot_shap_bar", "text": "<p>method plot_shap_bar(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's bar plot.</p> <p>Create a bar plot of a set of SHAP values. If a single sample is passed, then the SHAP values are plotted. If many samples are passed, then the mean absolute value for each feature column is plotted. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_bar()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_scatter Plot SHAP's scatter plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_bar/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_bar(show=10)\n</code></pre>"}, {"location": "API/plots/plot_shap_beeswarm/", "title": "plot_shap_beeswarm", "text": "<p>method plot_shap_beeswarm(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's beeswarm plot.</p> <p>The plot is colored by feature values. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. 
Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_beeswarm()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_beeswarm method does not support plotting a single sample. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_parshap Plot the partial correlation of shap values.</p> <p>plot_shap_bar Plot SHAP's bar plot.</p> <p>plot_shap_scatter Plot SHAP's scatter plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_beeswarm/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_beeswarm(show=10)\n</code></pre>"}, {"location": "API/plots/plot_shap_decision/", "title": "plot_shap_decision", "text": "<p>method plot_shap_decision(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's decision plot.</p> <p>Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values are printed in the plot (if supplied). If multiple predictions are plotted together, feature values will not be printed. Plotting too many predictions together will make the plot unintelligible. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_decision()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). 
Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_shap_bar Plot SHAP's bar plot.</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_force Plot SHAP's force plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_decision/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_decision(show=10)\n</code></pre><pre><code>&gt;&gt;&gt; atom.plot_shap_decision(rows=-1, show=10)\n</code></pre>"}, {"location": "API/plots/plot_shap_force/", "title": "plot_shap_force", "text": "<p>method plot_shap_force(models=None, rows=\"test\", target=1, title=None, legend=None, figsize=(900, 300), filename=None, display=True, **kwargs)[source]Plot SHAP's force plot.</p> <p>Visualize the given SHAP values with an 
additive force layout. Note that by default this plot will render using javascript. For a regular figure use <code>matplotlib=True</code> (this option is only available when only a single sample is plotted). Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_force()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=(900, 300) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure (only if <code>matplotlib=True</code> in <code>kwargs</code>). <p>**kwargs Additional keyword arguments for shap.plots.force. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_scatter Plot SHAP's scatter plot.</p> <p>plot_shap_decision Plot SHAP's decision plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_force/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_force(rows=-2, matplotlib=True, figsize=(1800, 300))\n</code></pre>"}, {"location": "API/plots/plot_shap_heatmap/", "title": "plot_shap_heatmap", "text": "<p>method plot_shap_heatmap(models=None, rows=\"test\", show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's heatmap plot.</p> <p>This plot is designed to show the population substructure of a dataset using supervised clustering and a heatmap. Supervised clustering involves clustering data points not by their original feature values but by their explanations. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_heatmap()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_heatmap method does not support plotting a single sample. <p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). 
Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_shap_decision Plot SHAP's decision plot.</p> <p>plot_shap_force Plot SHAP's force plot.</p> <p>plot_shap_waterfall Plot SHAP's waterfall plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_heatmap/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_heatmap(show=10)\n</code></pre>"}, {"location": "API/plots/plot_shap_scatter/", "title": "plot_shap_scatter", "text": "<p>method plot_shap_scatter(models=None, rows=\"test\", columns=0, target=1, title=None, legend=None, figsize=(900, 600), filename=None, display=True)[source]Plot SHAP's scatter plot.</p> <p>Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. 
This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_scatter()</code>. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to plot. The plot_shap_scatter method does not support plotting a single sample. <p>columns: int or str, default=0 Column to plot. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_decision Plot SHAP's decision plot.</p> <p>plot_shap_force Plot SHAP's force plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_scatter/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_scatter(columns=\"symmetry error\")\n</code></pre>"}, {"location": "API/plots/plot_shap_waterfall/", "title": "plot_shap_waterfall", "text": "<p>method plot_shap_waterfall(models=None, rows=0, show=None, target=1, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot SHAP's waterfall plot.</p> <p>The SHAP value of a feature represents the impact of the evidence provided by that feature on the model\u2019s output. The waterfall plot is designed to visually display how the SHAP values (evidence) of each feature move the model output from our prior expectation under the background data distribution, to the final model prediction given the evidence of all the features. Features are sorted by the magnitude of their SHAP values with the smallest magnitude features grouped together at the bottom of the plot when the number of features in the models exceeds the <code>show</code> parameter. Read more about SHAP plots in the user guide.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_waterfall()</code>. <p>rows: int or str, default=0 Selection of rows to plot. The plot_shap_waterfall method does not support plotting multiple samples. 
<p>show: int or None, default=None Number of features (ordered by importance) to show. If None, it shows all features. <p>target: int, str or tuple, default=1 Class in the target column to target. For multioutput tasks, the value should be a tuple of the form (column, class). Note that for binary and multilabel tasks, the selected class is always the positive one. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of features shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as png. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsplt.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_shap_bar Plot SHAP's bar plot.</p> <p>plot_shap_beeswarm Plot SHAP's beeswarm plot.</p> <p>plot_shap_heatmap Plot SHAP's heatmap plot.</p> <p></p>"}, {"location": "API/plots/plot_shap_waterfall/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"LR\")\n&gt;&gt;&gt; atom.plot_shap_waterfall(show=10)\n</code></pre>"}, {"location": "API/plots/plot_slice/", "title": "plot_slice", "text": "<p>method plot_slice(models=None, params=None, metric=None, title=None, legend=None, figsize=None, filename=None, display=True)[source]Plot the parameter relationship in a study.</p> <p>The color of the markers indicates the trial. This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model or None, default=None Model to plot. If None, all models are selected. Note that leaving the default option could raise an exception if there are multiple models. To avoid this, call the plot directly from a model, e.g., <code>atom.lr.plot_slice()</code>. <p>params: str, segment, sequence or None, default=None Hyperparameters to plot. Use a sequence or add <code>+</code> between options to select more than one. If None, all the model's hyperparameters are selected. <p>metric: int or str, default=None Metric to plot (only for multi-metric runs). If str, add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. 
<p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of hyperparameters shown. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_edf Plot the Empirical Distribution Function of a study.</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p>plot_parallel_coordinate Plot high-dimensional parameter relationships in a study.</p> <p></p>"}, {"location": "API/plots/plot_slice/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\n...     models=\"RF\",\n...     metric=[\"f1\", \"recall\"],\n...     n_trials=15,\n... )\n&gt;&gt;&gt; atom.plot_slice(params=(0, 1, 2))\n</code></pre>"}, {"location": "API/plots/plot_successive_halving/", "title": "plot_successive_halving", "text": "<p>method plot_successive_halving(models=None, metric=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot scores per iteration of the successive halving.</p> <p>Only use with models fitted using successive halving. Ensembles are ignored.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. 
<p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Use a sequence or add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_learning_curve Plot the learning curve: score vs number of training samples.</p> <p>plot_results Plot the model results.</p> <p></p>"}, {"location": "API/plots/plot_successive_halving/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.successive_halving([\"Tree\", \"Bag\", \"RF\", \"LGB\"], n_bootstrap=5)\n&gt;&gt;&gt; atom.plot_successive_halving()\n</code></pre>"}, {"location": "API/plots/plot_terminator_improvement/", "title": "plot_terminator_improvement", "text": "<p>method plot_terminator_improvement(models=None, title=None, legend=\"upper right\", figsize=(900, 600), filename=None, display=True)[source]Plot the potentials for future objective improvement.</p> <p>This function visualizes the objective improvement potentials. It helps to determine whether you should continue the optimization or not. The evaluated error is also plotted. Note that this function may take some time to compute the improvement potentials. This plot is only available for models that ran hyperparameter tuning.</p> <p>Warning</p> <ul> <li>The plot_terminator_improvement method is only available   for models that ran hyperparameter tuning using   cross-validation, e.g., using <code>ht_params={'cv': 5}</code>.</li> <li>This method does not support   multi-objective optimizations (multi-metric runs).</li> <li>The calculation of the improvement can be slow. Set the   <code>memory</code> parameter to cache the   results and speed up repeated calls.</li> </ul> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. <p>title: str, dict or None, default=None Title for the plot. 
<ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_pareto_front Plot the Pareto front of a study.</p> <p>plot_timeline Plot the timeline of a study.</p> <p>plot_trials Plot the hyperparameter tuning trials.</p> <p></p>"}, {"location": "API/plots/plot_terminator_improvement/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\"RF\", n_trials=10, ht_params={\"cv\": 5})\n&gt;&gt;&gt; atom.plot_terminator_improvement()\n</code></pre>"}, {"location": "API/plots/plot_threshold/", "title": "plot_threshold", "text": "<p>method plot_threshold(models=None, metric=None, rows=\"test\", target=0, steps=100, title=None, legend=\"lower left\", figsize=(900, 600), filename=None, display=True)[source]Plot metric performances against threshold values.</p> <p>This plot is available 
only for models with a <code>predict_proba</code> method in a binary or multilabel classification task.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models are selected. <p>metric: str, func, scorer, sequence or None, default=None Metric to plot. Choose from any of sklearn's scorers, a function with signature <code>metric(y_true, y_pred)</code>, a scorer object or a sequence of these. Use a sequence or add <code>+</code> between options to select more than one. If None, the metric used to run the pipeline is selected. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows on which to calculate the metric. <p>target: int or str, default=0 Target column to look at. Only for multilabel tasks. <p>steps: int, default=100 Number of thresholds measured. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_calibration Plot the calibration curve for a binary classifier.</p> <p>plot_confusion_matrix Plot a model's confusion matrix.</p> <p>plot_probabilities Plot the probability distribution of the target classes.</p> <p></p>"}, {"location": "API/plots/plot_threshold/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"LR\", \"RF\"])\n&gt;&gt;&gt; atom.plot_threshold()\n</code></pre>"}, {"location": "API/plots/plot_timeline/", "title": "plot_timeline", "text": "<p>method plot_timeline(models=None, title=None, legend=\"lower right\", figsize=(900, 600), filename=None, display=True)[source]Plot the timeline of a study.</p> <p>This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"lower right\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. 
If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_edf Plot the Empirical Distribution Function of a study.</p> <p>plot_slice Plot the parameter relationship in a study.</p> <p>plot_terminator_improvement Plot the potentials for future objective improvement.</p> <p></p>"}, {"location": "API/plots/plot_timeline/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from optuna.pruners import PatientPruner\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run(\n...     models=\"LGB\",\n...     n_trials=15,\n...     ht_params={\"pruner\": PatientPruner(None, patience=2)},\n... )\n&gt;&gt;&gt; atom.plot_timeline()\n</code></pre>"}, {"location": "API/plots/plot_trials/", "title": "plot_trials", "text": "<p>method plot_trials(models=None, metric=None, title=None, legend=\"upper left\", figsize=(900, 800), filename=None, display=True)[source]Plot the hyperparameter tuning trials.</p> <p>Creates a figure with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. The best trial is indicated with a star. This is the same plot as produced by <code>ht_params={\"plot\": True}</code>. This plot is only available for models that ran hyperparameter tuning.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to plot. If None, all models that used hyperparameter tuning are selected. <p>metric: int, str, sequence or None, default=None Metric to plot (only for multi-metric runs). Add <code>+</code> between options to select more than one. 
If None, all metrics are selected. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=\"upper left\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple, default=(900, 800) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. 
<p></p> <p></p> <p>See Also</p> <p>plot_evals Plot evaluation curves.</p> <p>plot_hyperparameters Plot hyperparameter relationships in a study.</p> <p>plot_results Plot the model results.</p> <p></p>"}, {"location": "API/plots/plot_trials/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=100, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.run([\"ET\", \"RF\"], n_trials=15)\n&gt;&gt;&gt; atom.plot_trials()\n</code></pre>"}, {"location": "API/plots/plot_wordcloud/", "title": "plot_wordcloud", "text": "<p>method plot_wordcloud(rows=\"dataset\", title=None, legend=None, figsize=(900, 600), filename=None, display=True, **kwargs)[source]Plot a wordcloud from the corpus.</p> <p>The text for the plot is extracted from the column named <code>corpus</code>. If there is no column with that name, an exception is raised.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"dataset\" Selection of rows in the corpus to include in the wordcloud. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: str, dict or None, default=None Do nothing. Implemented for continuity of the API. <p>figsize: tuple, default=(900, 600) Figure's size in pixels, format as (x, y). <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool or None, default=True Whether to render the plot. If None, it returns the figure. 
<p>**kwargs Additional keyword arguments for the Wordcloud object. <p>Returnsgo.Figure or None Plot object. Only returned if <code>display=None</code>. <p></p> <p></p> <p>See Also</p> <p>plot_ngrams Plot n-gram frequencies.</p> <p>plot_pipeline Plot a diagram of the pipeline.</p> <p></p>"}, {"location": "API/plots/plot_wordcloud/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; import numpy as np\n&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import fetch_20newsgroups\n\n&gt;&gt;&gt; X, y = fetch_20newsgroups(\n...     return_X_y=True,\n...     categories=[\"alt.atheism\", \"sci.med\", \"comp.windows.x\"],\n...     shuffle=True,\n...     random_state=1,\n... )\n&gt;&gt;&gt; X = np.array(X).reshape(-1, 1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y, random_state=1)\n&gt;&gt;&gt; atom.textclean()\n&gt;&gt;&gt; atom.textnormalize()\n&gt;&gt;&gt; atom.plot_wordcloud()\n</code></pre>"}, {"location": "API/training/directclassifier/", "title": "DirectClassifier", "text": "<p>class atom.training.DirectClassifier(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. 
Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, a default metric is selected for every task: <ul> <li>\"f1\" for binary classification</li> <li>\"f1_weighted\" for multiclass(-multioutput) classification</li> <li>\"average_precision\" for multilabel classification</li> </ul> <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. 
If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. 
The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWarnings</code> environment. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. 
Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. <p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>SuccessiveHalvingClassifier Train and evaluate the models in a successive halving fashion.</p> <p>TrainSizingClassifier Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/directclassifier/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import DirectClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... 
)\n\n&gt;&gt;&gt; runner = DirectClassifier(models=[\"LR\", \"RF\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nModels: LR, RF\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.992\nTest evaluation --&gt; f1: 0.9767\nTime elapsed: 0.104s\n-------------------------------------------------\nTime: 0.104s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.968\nTime elapsed: 0.204s\n-------------------------------------------------\nTime: 0.204s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.314s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9767 !\nRandomForest       --&gt; f1: 0.968\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n    f1_train  f1_test  time_fit      time\nLR     0.992   0.9767  0.104497  0.104497\nRF     1.000   0.9680  0.204185  0.204185\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n    accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR    0.9708  0.9976  0.9702  0.9767   0.9545  0.9374     0.9813  0.9722  0.9959\nRF    0.9591  0.9490  0.9511  0.9680   0.9381  0.9118     0.9550  0.9815  0.9511\n</code></pre>"}, {"location": "API/training/directclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directclassifier/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. 
X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/directclassifier/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. 
<p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/directclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/directclassifier/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/training/directclassifier/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. 
</li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. 
The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. 
<p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. 
</p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/directforecaster/", "title": "DirectForecaster", "text": "<p>class atom.training.DirectForecaster(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>mean_absolute_percentage_error</code> is selected. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. 
Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. 
Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWarnings</code> environment. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p>SuccessiveHalvingForecaster Train and evaluate the models in a successive halving fashion.</p> <p>TrainSizingForecaster Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/directforecaster/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import DirectForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n&gt;&gt;&gt; from sktime.split import temporal_train_test_split\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; train, test = temporal_train_test_split(y, test_size=0.2)\n\n&gt;&gt;&gt; runner = DirectForecaster(models=[\"ES\", \"ETS\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nModels: ES, ETS\nMetric: mape\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0868\nTest evaluation --&gt; mape: -0.2018\nTime elapsed: 0.019s\n-------------------------------------------------\nTime: 0.019s\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0863\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.041s\n-------------------------------------\nExponentialSmoothing --&gt; mape: -0.2018 !\nETS                  --&gt; mape: -0.202\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n     mape_train  mape_test  time_fit      time\nES      -0.0868    -0.2018  0.019017  0.019017\nETS     -0.0863    -0.2020  0.020018  0.020018\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n         mae    mape        mse      r2     rmse\nES  -81.3862 -0.2018 -8661.7730 -0.4189 -93.0686\nETS -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\n</code></pre>"}, {"location": 
"API/training/directforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directforecaster/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/directforecaster/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. 
<p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/directforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/directforecaster/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. 
title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/directforecaster/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
<p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/directregressor/", "title": "DirectRegressor", "text": "<p>class atom.training.DirectRegressor(models=None, metric=None, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a direct fashion.</p> <p>The following steps are applied to every model:</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>r2</code> is selected. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. 
Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. 
Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores + 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of Python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMRegressor Main class for regression tasks.</p> <p>SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.</p> <p>TrainSizingRegressor Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/directregressor/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import DirectRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_digits\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_digits(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = DirectRegressor(models=[\"OLS\", \"RF\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nModels: OLS, RF\nMetric: r2\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.5991\nTest evaluation --&gt; r2: 0.5765\nTime elapsed: 0.154s\n-------------------------------------------------\nTime: 0.154s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.9803\nTest evaluation --&gt; r2: 0.8803\nTime elapsed: 1.594s\n-------------------------------------------------\nTime: 1.594s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.749s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.5765\nRandomForest         --&gt; r2: 0.8803 !\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n     r2_train  r2_test  time_fit      time\nOLS    0.5991   0.5765  0.153989  0.153989\nRF     0.9803   0.8803  1.594449  1.594449\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n        mae          mape     mse      r2    rmse\nOLS -1.4553 -9.184808e+14 -3.4564  0.5765 -1.8591\nRF  
-0.6098 -2.854782e+14 -0.9773  0.8803 -0.9886\n</code></pre>"}, {"location": "API/training/directregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/directregressor/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/directregressor/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). 
metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/directregressor/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/directregressor/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. 
<p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/directregressor/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
<p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/", "title": "SuccessiveHalvingClassifier", "text": "<p>class atom.training.SuccessiveHalvingClassifier(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, a default metric is selected for every task: <ul> <li>\"f1\" for binary classification</li> <li>\"f1_weighted\" for multiclass(-multioutput) classification</li> <li>\"average_precision\" for multilabel classification</li> </ul> <p>skip_runs: int, default=0 Skip last <code>skip_runs</code> runs of the successive halving. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. 
If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). 
<p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>DirectClassifier Train and evaluate the models in a direct fashion.</p> <p>TrainSizingClassifier Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import SuccessiveHalvingClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = SuccessiveHalvingClassifier([\"LR\", \"RF\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: f1\n\n\nRun: 0 =========================== &gt;&gt;\nModels: LR2, RF2\nSize of training set: 398 (50%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.996\nTest evaluation --&gt; f1: 0.9677\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.9444\nTime elapsed: 0.137s\n-------------------------------------------------\nTime: 0.137s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.228s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9677 !\nRandomForest       --&gt; f1: 0.9444\n\n\nRun: 1 =========================== &gt;&gt;\nModels: LR1\nSize of training set: 398 (100%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 
0.994\nTest evaluation --&gt; f1: 0.9818\nTime elapsed: 0.095s\n-------------------------------------------------\nTime: 0.095s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.098s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9818\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            f1_train  f1_test  time_fit      time\nfrac model                                       \n0.5  LR2       0.996   0.9677  0.086078  0.086078\n     RF2       1.000   0.9444  0.137125  0.137125\n1.0  LR1       0.994   0.9818  0.094800  0.094800\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n     accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR2    0.9591  0.9963  0.9609  0.9677   0.9375  0.9124     0.9813  0.9545  0.9937\nRF2    0.9298  0.9391  0.9308  0.9444   0.8947  0.8504     0.9623  0.9273  0.9308\nLR1    0.9766  0.9972  0.9745  0.9818   0.9643  0.9490     0.9818  0.9818  0.9952\n</code></pre>"}, {"location": "API/training/successivehalvingclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingclassifier/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. 
y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/training/successivehalvingclassifier/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. 
</li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. 
The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. 
<p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. 
</p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/", "title": "SuccessiveHalvingForecaster", "text": "<p>class atom.training.SuccessiveHalvingForecaster(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>mean_absolute_percentage_error</code> is selected. <p>skip_runs: int, default=0 Skip last <code>skip_runs</code> runs of the successive halving. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. 
See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. 
Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWarnings</code> environment. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p>DirectForecaster Train and evaluate the models in a direct fashion.</p> <p>TrainSizingForecaster Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import SuccessiveHalvingForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n&gt;&gt;&gt; from sktime.split import temporal_train_test_split\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; train, test = temporal_train_test_split(y, test_size=0.2)\n\n&gt;&gt;&gt; runner = SuccessiveHalvingForecaster([\"ETS\", \"ES\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: mape\n\n\nRun: 0 =========================== &gt;&gt;\nModels: ETS2, ES2\nSize of training set: 115 (50%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0879\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0879\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.039s\n-------------------------------------\nETS                  --&gt; mape: -0.202 !\nExponentialSmoothing --&gt; mape: -0.202 !\n\n\nRun: 1 =========================== &gt;&gt;\nModels: ETS1\nSize of training set: 115 (100%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0863\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 
0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.021s\n-------------------------------------\nETS --&gt; mape: -0.202\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            mape_train  mape_test  time_fit      time\nfrac model                                           \n0.5  ES2       -0.0879     -0.202  0.017015  0.017015\n     ETS2      -0.0879     -0.202  0.020018  0.020018\n1.0  ETS1      -0.0863     -0.202  0.020018  0.020018\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n          mae   mape        mse      r2     rmse\nETS2 -81.4454 -0.202 -8673.3633 -0.4208 -93.1309\nES2  -81.4483 -0.202 -8673.9309 -0.4209 -93.1339\nETS1 -81.4454 -0.202 -8673.3633 -0.4208 -93.1309\n</code></pre>"}, {"location": "API/training/successivehalvingforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingforecaster/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. 
y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. 
</li> </ul> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/training/successivehalvingforecaster/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. 
</li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. 
The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. 
<p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. 
</p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always prefixed with the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always prefixed with the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/successivehalvingregressor/", "title": "SuccessiveHalvingRegressor", "text": "<p>class atom.training.SuccessiveHalvingRegressor(models=None, metric=None, skip_runs=0, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a successive halving fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>r2</code> is selected. <p>skip_runs: int, default=0 Skip last <code>skip_runs</code> runs of the successive halving. <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. 
See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have build-in parallelizations (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. 
Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. 
<ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWarnings</code> environment. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMRegressor Main class for regression tasks.</p> <p>DirectRegressor Train and evaluate the models in a direct fashion.</p> <p>TrainSizingRegressor Train and evaluate the models in a train sizing fashion.</p> <p></p>"}, {"location": "API/training/successivehalvingregressor/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import SuccessiveHalvingRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_digits\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_digits(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = SuccessiveHalvingRegressor([\"OLS\", \"RF\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: r2\n\n\nRun: 0 =========================== &gt;&gt;\nModels: OLS2, RF2\nSize of training set: 1257 (50%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6083\nTest evaluation --&gt; r2: -2.168057727555873e+23\nTime elapsed: 0.146s\n-------------------------------------------------\nTime: 0.146s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.9685\nTest evaluation --&gt; r2: 0.7924\nTime elapsed: 0.913s\n-------------------------------------------------\nTime: 0.913s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.061s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: -2.168057727555873e+23 ~\nRandomForest         --&gt; r2: 0.7924 !\n\n\nRun: 1 =========================== &gt;&gt;\nModels: RF1\nSize of training set: 1257 (100%)\nSize of test set: 540\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation 
--&gt; r2: 0.9802\nTest evaluation --&gt; r2: 0.8692\nTime elapsed: 1.571s\n-------------------------------------------------\nTime: 1.571s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.573s\n-------------------------------------\nRandomForest --&gt; r2: 0.8692\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            r2_train       r2_test  time_fit      time\nfrac model                                            \n0.5  OLS2     0.6083 -2.168058e+23  0.146151  0.146151\n     RF2      0.9685  7.924000e-01  0.912829  0.912829\n1.0  RF1      0.9802  8.692000e-01  1.571428  1.571428\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n               mae          mape           mse            r2          rmse\nOLS2 -1.375810e+11 -6.979478e+14 -1.715067e+24 -2.168058e+23 -1.309606e+12\nRF2  -8.656000e-01 -3.503634e+14 -1.642300e+00  7.924000e-01 -1.281500e+00\nRF1  -6.385000e-01 -1.768080e+14 -1.034400e+00  8.692000e-01 -1.017000e+00\n</code></pre>"}, {"location": "API/training/successivehalvingregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/successivehalvingregressor/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. 
y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/successivehalvingregressor/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. 
Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/successivehalvingregressor/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/successivehalvingregressor/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. 
</p> <p></p>"}, {"location": "API/training/successivehalvingregressor/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. 
</li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. 
Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. 
The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. 
<p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. 
</p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded with the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded with the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/trainsizingclassifier/", "title": "TrainSizingClassifier", "text": "<p>class atom.training.TrainSizingClassifier(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, a default metric is selected for every task: <ul> <li>\"f1\" for binary classification</li> <li>\"f1_weighted\" for multiclass(-multioutput) classification</li> <li>\"average_precision\" for multilabel classification</li> </ul> <p>train_sizes: int or sequence, default=5 Training set sizes used to run the trainings. 
<ul> <li>If int: Number of equally distributed splits, i.e., for a   value <code>N</code>, it's equal to <code>np.linspace(1.0/N, 1.0, N)</code>.</li> <li>If sequence: Fraction of the training set when &lt;=1, else   total number of samples.</li> </ul> <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. 
If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelizations (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. 
Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMClassifier Main class for classification tasks.</p> <p>DirectClassifier Train and evaluate the models in a direct fashion.</p> <p>SuccessiveHalvingClassifier Train and evaluate the models in a successive halving fashion.</p> <p></p>"}, {"location": "API/training/trainsizingclassifier/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import TrainSizingClassifier\n&gt;&gt;&gt; from sklearn.datasets import load_breast_cancer\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = TrainSizingClassifier(models=\"LR\", verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: f1\n\n\nRun: 0 =========================== &gt;&gt;\nModels: LR02\nSize of training set: 79 (20%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9899\nTest evaluation --&gt; f1: 0.9455\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.089s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9455\n\n\nRun: 1 =========================== &gt;&gt;\nModels: LR04\nSize of training set: 159 (40%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9898\nTest evaluation --&gt; f1: 0.9727\nTime elapsed: 0.086s\n-------------------------------------------------\nTime: 0.086s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.088s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9727\n\n\nRun: 2 
=========================== &gt;&gt;\nModels: LR06\nSize of training set: 238 (60%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9936\nTest evaluation --&gt; f1: 0.9683\nTime elapsed: 0.085s\n-------------------------------------------------\nTime: 0.085s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.088s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9683\n\n\nRun: 3 =========================== &gt;&gt;\nModels: LR08\nSize of training set: 318 (80%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9901\nTest evaluation --&gt; f1: 0.9817\nTime elapsed: 0.096s\n-------------------------------------------------\nTime: 0.096s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.099s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9817\n\n\nRun: 4 =========================== &gt;&gt;\nModels: LR10\nSize of training set: 398 (100%)\nSize of test set: 171\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.992\nTest evaluation --&gt; f1: 0.9772\nTime elapsed: 0.099s\n-------------------------------------------------\nTime: 0.099s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.102s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9772\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            f1_train  f1_test  time_fit      time\nfrac model                                       \n0.2  LR02     0.9899   0.9455  0.086078  0.086078\n0.4  LR04     0.9898   0.9727  0.086078  0.086078\n0.6  LR06     0.9936   0.9683  0.085077  0.085077\n0.8  LR08     0.9901   0.9817  0.095865  0.095865\n1.0  LR10     0.9920   0.9772  0.098852  0.098852\n\n\n&gt;&gt;&gt; 
print(runner.evaluate())\n\n      accuracy      ap      ba      f1  jaccard     mcc  precision  recall     auc\nLR02    0.9298  0.9916  0.9180  0.9455   0.8966  0.8483     0.9286  0.9630  0.9857\nLR04    0.9649  0.9971  0.9557  0.9727   0.9469  0.9248     0.9554  0.9907  0.9950\nLR06    0.9591  0.9976  0.9478  0.9683   0.9386  0.9124     0.9469  0.9907  0.9959\nLR08    0.9766  0.9963  0.9716  0.9817   0.9640  0.9497     0.9727  0.9907  0.9938\nLR10    0.9708  0.9973  0.9636  0.9772   0.9554  0.9372     0.9640  0.9907  0.9954\n</code></pre>"}, {"location": "API/training/trainsizingclassifier/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingclassifier/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. 
n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/trainsizingclassifier/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/trainsizingclassifier/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. 
Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/trainsizingclassifier/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/trainsizingclassifier/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this 
estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. 
See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. 
</p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. 
Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. 
The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p>"}, {"location": "API/training/trainsizingforecaster/", "title": "TrainSizingForecaster", "text": "<p>class atom.training.TrainSizingForecaster(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>mean_absolute_percentage_error</code> is selected. <p>train_sizes: int or sequence, default=5 Training set sizes used to run the trainings. 
<ul> <li>If int: Number of equally distributed splits, i.e., for a   value <code>N</code>, it's equal to <code>np.linspace(1.0/N, 1.0, N)</code>.</li> <li>If sequence: Fraction of the training set when &lt;=1, else   total number of samples.</li> </ul> <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. 
If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. 
Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMForecaster Main class for forecasting tasks.</p> <p>DirectForecaster Train and evaluate the models in a direct fashion.</p> <p>SuccessiveHalvingForecaster Train and evaluate the models in a successive halving fashion.</p> <p></p>"}, {"location": "API/training/trainsizingforecaster/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import TrainSizingForecaster\n&gt;&gt;&gt; from sktime.datasets import load_airline\n&gt;&gt;&gt; from sktime.split import temporal_train_test_split\n\n&gt;&gt;&gt; y = load_airline()\n\n&gt;&gt;&gt; train, test = temporal_train_test_split(y, test_size=0.2)\n\n&gt;&gt;&gt; runner = TrainSizingForecaster([\"ETS\", \"ES\"], verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: mape\n\n\nRun: 0 =========================== &gt;&gt;\nModels: ETS02, ES02\nSize of training set: 23 (20%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0889\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.021s\n-------------------------------------------------\nTime: 0.021s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0889\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.041s\n-------------------------------------\nETS                  --&gt; mape: -0.202 !\nExponentialSmoothing --&gt; mape: -0.202 !\n\n\nRun: 1 =========================== &gt;&gt;\nModels: ETS04, ES04\nSize of training set: 46 (40%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0871\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 
0.019s\n-------------------------------------------------\nTime: 0.019s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0871\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.039s\n-------------------------------------\nETS                  --&gt; mape: -0.202 !\nExponentialSmoothing --&gt; mape: -0.202 !\n\n\nRun: 2 =========================== &gt;&gt;\nModels: ETS06, ES06\nSize of training set: 69 (60%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0861\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0867\nTest evaluation --&gt; mape: -0.2016\nTime elapsed: 0.017s\n-------------------------------------------------\nTime: 0.017s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.038s\n-------------------------------------\nETS                  --&gt; mape: -0.202\nExponentialSmoothing --&gt; mape: -0.2016 !\n\n\nRun: 3 =========================== &gt;&gt;\nModels: ETS08, ES08\nSize of training set: 92 (80%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0842\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0845\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== &gt;&gt;\nTotal 
time: 0.040s\n-------------------------------------\nETS                  --&gt; mape: -0.202 !\nExponentialSmoothing --&gt; mape: -0.202 !\n\n\nRun: 4 =========================== &gt;&gt;\nModels: ETS10, ES10\nSize of training set: 115 (100%)\nSize of test set: 29\n\n\nResults for ETS:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0863\nTest evaluation --&gt; mape: -0.202\nTime elapsed: 0.020s\n-------------------------------------------------\nTime: 0.020s\n\n\nResults for ExponentialSmoothing:\nFit ---------------------------------------------\nTrain evaluation --&gt; mape: -0.0868\nTest evaluation --&gt; mape: -0.2018\nTime elapsed: 0.018s\n-------------------------------------------------\nTime: 0.018s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.040s\n-------------------------------------\nETS                  --&gt; mape: -0.202\nExponentialSmoothing --&gt; mape: -0.2018 !\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            mape_train  mape_test  time_fit      time\nfrac model                                           \n0.2  ES02      -0.0889    -0.2020  0.017015  0.017015\n     ETS02     -0.0889    -0.2020  0.021020  0.021020\n0.4  ES04      -0.0871    -0.2020  0.018016  0.018016\n     ETS04     -0.0871    -0.2020  0.019017  0.019017\n0.6  ES06      -0.0867    -0.2016  0.017015  0.017015\n     ETS06     -0.0861    -0.2020  0.020019  0.020019\n0.8  ES08      -0.0845    -0.2020  0.018016  0.018016\n     ETS08     -0.0842    -0.2020  0.020018  0.020018\n1.0  ES10      -0.0868    -0.2018  0.018016  0.018016\n     ETS10     -0.0863    -0.2020  0.020018  0.020018\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n           mae    mape        mse      r2     rmse\nETS02 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES02  -81.4444 -0.2020 -8673.1766 -0.4208 -93.1299\nETS04 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES04  -81.4483 -0.2020 -8673.9309 -0.4209 
-93.1339\nETS06 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES06  -81.3025 -0.2016 -8645.4416 -0.4162 -92.9809\nETS08 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES08  -81.4483 -0.2020 -8673.9309 -0.4209 -93.1339\nETS10 -81.4454 -0.2020 -8673.3633 -0.4208 -93.1309\nES10  -81.3862 -0.2018 -8661.7730 -0.4189 -93.0686\n</code></pre>"}, {"location": "API/training/trainsizingforecaster/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingforecaster/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). 
</p> <p></p>"}, {"location": "API/training/trainsizingforecaster/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/trainsizingforecaster/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. 
</p> <p></p>"}, {"location": "API/training/trainsizingforecaster/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/trainsizingforecaster/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. 
Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). 
If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. <p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. 
Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. </p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. 
</p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. 
<p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. 
</p> <p></p>"}, {"location": "API/training/trainsizingregressor/", "title": "TrainSizingRegressor", "text": "<p>class atom.training.TrainSizingRegressor(models=None, metric=None, train_sizes=5, est_params=None, n_trials=0, ht_params=None, n_bootstrap=0, parallel=False, errors=\"skip\", n_jobs=1, device=\"cpu\", engine={'data': 'numpy', 'estimator': 'sklearn'}, backend=\"loky\", memory=False, verbose=0, warnings=False, logger=None, experiment=None, random_state=None)[source]Train and evaluate the models in a train sizing fashion.</p> <p>The following steps are applied to every model (per iteration):</p> <ol> <li>Apply hyperparameter tuning (optional).</li> <li>Fit the model on the training set using the best combination    of hyperparameters found.</li> <li>Evaluate the model on the test set.</li> <li>Train the estimator on various bootstrapped    samples of the training set and evaluate again on the test set    (optional).</li> </ol> <p>Parametersmodels: str, estimator or sequence, default=None Models to fit to the data. Allowed inputs are: an acronym from any of the predefined models, an ATOMModel or a custom predictor as class or instance. If None, all the predefined models are used. <p>metric: str, func, scorer, sequence or None, default=None Metric on which to fit the models. Choose from any of sklearn's scorers, a function with signature <code>function(y_true, y_pred, **kwargs) -&gt; score</code>, a scorer object or a sequence of these. If None, the default metric <code>r2</code> is selected. <p>train_sizes: int or sequence, default=5 Training set sizes used to run the trainings. <ul> <li>If int: Number of equally distributed splits, i.e., for a   value <code>N</code>, it's equal to <code>np.linspace(1.0/N, 1.0, N)</code>.</li> <li>If sequence: Fraction of the training set when &lt;=1, else   total number of samples.</li> </ul> <p>n_trials: int, dict or sequence, default=0 Maximum number of iterations for the hyperparameter tuning. 
If 0, skip the tuning and fit the model on its default parameters. If sequence, the n-th value applies to the n-th model. <p>est_params: dict or None, default=None Additional parameters for the models. See their corresponding documentation for the available options. For multiple models, use the acronyms as key (or 'all' for all models) and a dict of the parameters as value. Add <code>_fit</code> to the parameter's name to pass it to the estimator's fit method instead of the constructor. <p>ht_params: dict or None, default=None Additional parameters for the hyperparameter tuning. If None, it uses the same parameters as the first run. Can include: <ul> <li>cv: int, cv-generator, dict or sequence, default=1   Cross-validation object or number of splits. If 1, the   data is randomly split in a subtrain and validation set.</li> <li>plot: bool, dict or sequence, default=False   Whether to plot the optimization's progress as it runs.   Creates a canvas with two plots: the first plot shows the   score of every trial and the second shows the distance between   the last consecutive steps. See the plot_trials method.</li> <li>distributions: dict, sequence or None, default=None   Custom hyperparameter distributions. If None, it uses the   model's predefined distributions. Read more in the   user guide.</li> <li>tags: dict, sequence or None, default=None   Custom tags for the model's trial and mlflow run.</li> <li>**kwargs   Additional Keyword arguments for the constructor of the   study class or the optimize method.</li> </ul> <p>n_bootstrap: int or sequence, default=0 Number of data sets to use for bootstrapping. If 0, no bootstrapping is performed. If sequence, the n-th value applies to the n-th model. <p>parallel: bool, default=False Whether to train the models in a parallel or sequential fashion. Using <code>parallel=True</code> turns off the verbosity of the models during training. 
Note that many models also have built-in parallelization (often when the estimator has the <code>n_jobs</code> parameter). <p>errors: str, default=\"skip\" How to handle exceptions encountered during model training. Choose from: <ul> <li>\"raise\": Raise any encountered exception.</li> <li>\"skip\": Skip a failed model. This model is not accessible   after training.</li> <li>\"keep\": Keep the model in its state at failure. Note that   this model can break down many other methods after training.   This option is useful to be able to rerun hyperparameter   optimization after failure without losing previous successful   trials.</li> </ul> <p>n_jobs: int, default=1 Number of cores to use for parallel processing. <ul> <li>If &gt;0: Number of cores to use.</li> <li>If -1: Use all available cores.</li> <li>If &lt;-1: Use number of cores - 1 + <code>n_jobs</code>.</li> </ul> <p>device: str, default=\"cpu\" Device on which to run the estimators. Use any string that follows the SYCL_DEVICE_FILTER filter selector, e.g. <code>device=\"gpu\"</code> to use the GPU. Read more in the user guide. <p>engine: dict, default={\"data\": \"numpy\", \"estimator\": \"sklearn\"} Execution engine to use for data and estimators. The value should be a dictionary with keys <code>data</code> and/or <code>estimator</code>, with their corresponding choice as values. Choose from: <ul> <li> <p>\"data\":</p> <ul> <li>\"numpy\"</li> <li>\"pyarrow\"</li> <li>\"modin\"</li> </ul> </li> <li> <p>\"estimator\":</p> <ul> <li>\"sklearn\"</li> <li>\"sklearnex\"</li> <li>\"cuml\"</li> </ul> </li> </ul> <p>backend: str, default=\"loky\" Parallelization backend. Read more in the user guide. Choose from: <ul> <li>\"loky\": Single-node, process-based parallelism.</li> <li>\"multiprocessing\": Legacy single-node, process-based   parallelism. 
Less robust than <code>loky</code>.</li> <li>\"threading\": Single-node, thread-based parallelism.</li> <li>\"ray\": Multi-node, process-based parallelism.</li> </ul> <p>memory: bool, str, Path or Memory, default=False Enables caching for memory optimization. Read more in the user guide. <ul> <li>If False: No caching is performed.</li> <li>If True: A default temp directory is used.</li> <li>If str: Path to the caching directory.</li> <li>If Path: A pathlib.Path to the caching directory.</li> <li>If Memory: Object with the joblib.Memory interface.</li> </ul> <p>verbose: int, default=0 Verbosity level of the class. Choose from: <ul> <li>0 to not print anything.</li> <li>1 to print basic information.</li> <li>2 to print detailed information.</li> </ul> <p>warnings: bool or str, default=False <ul> <li>If True: Default warning action (equal to \"once\").</li> <li>If False: Suppress all warnings (equal to \"ignore\").</li> <li>If str: One of python's warnings filters.</li> </ul> <p>Changing this parameter affects the <code>PYTHONWARNINGS</code> environment variable. ATOM can't manage warnings that go from C/C++ code to stdout.</p> <p>logger: str, Logger or None, default=None <ul> <li>If None: Logging isn't used.</li> <li>If str: Name of the log file. Use \"auto\" for automatic name.</li> <li>If Path: A pathlib.Path to the log file.</li> <li>Else: Python <code>logging.Logger</code> instance.</li> </ul> <p>experiment: str or None, default=None Name of the mlflow experiment to use for tracking. If None, no mlflow tracking is performed. <p>random_state: int or None, default=None Seed used by the random number generator. If None, the random number generator is the <code>RandomState</code> used by <code>np.random</code>. 
<p></p> <p></p> <p>See Also</p> <p>ATOMRegressor Main class for regression tasks.</p> <p>DirectRegressor Train and evaluate the models in a direct fashion.</p> <p>SuccessiveHalvingRegressor Train and evaluate the models in a successive halving fashion.</p> <p></p>"}, {"location": "API/training/trainsizingregressor/#example", "title": "Example", "text": "<pre><code>&gt;&gt;&gt; from atom.training import TrainSizingRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_digits\n&gt;&gt;&gt; from sklearn.model_selection import train_test_split\n\n&gt;&gt;&gt; X, y = load_digits(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; train, test = train_test_split(\n...     X.merge(y.to_frame(), left_index=True, right_index=True),\n...     test_size=0.3,\n... )\n\n&gt;&gt;&gt; runner = TrainSizingRegressor(models=\"OLS\", verbose=2)\n&gt;&gt;&gt; runner.run(train, test)\n\n\nTraining ========================= &gt;&gt;\nMetric: r2\n\n\nRun: 0 =========================== &gt;&gt;\nModels: OLS02\nSize of training set: 251 (20%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6391\nTest evaluation --&gt; r2: -4.630208907041091e+25\nTime elapsed: 0.148s\n-------------------------------------------------\nTime: 0.148s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.149s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: -4.630208907041091e+25 ~\n\n\nRun: 1 =========================== &gt;&gt;\nModels: OLS04\nSize of training set: 502 (40%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6137\nTest evaluation --&gt; r2: -9.496101715653298e+22\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.151s\n-------------------------------------\nOrdinaryLeastSquares 
--&gt; r2: -9.496101715653298e+22 ~\n\n\nRun: 2 =========================== &gt;&gt;\nModels: OLS06\nSize of training set: 754 (60%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.6086\nTest evaluation --&gt; r2: -0.2872\nTime elapsed: 0.151s\n-------------------------------------------------\nTime: 0.151s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.152s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: -0.2872 ~\n\n\nRun: 3 =========================== &gt;&gt;\nModels: OLS08\nSize of training set: 1005 (80%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.5986\nTest evaluation --&gt; r2: 0.5025\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.152s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.5025\n\n\nRun: 4 =========================== &gt;&gt;\nModels: OLS10\nSize of training set: 1257 (100%)\nSize of test set: 540\n\n\nResults for OrdinaryLeastSquares:\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.5951\nTest evaluation --&gt; r2: 0.5864\nTime elapsed: 0.150s\n-------------------------------------------------\nTime: 0.150s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.151s\n-------------------------------------\nOrdinaryLeastSquares --&gt; r2: 0.5864\n\n\n&gt;&gt;&gt; # Analyze the results\n&gt;&gt;&gt; print(runner.results)\n\n            r2_train       r2_test  time_fit      time\nfrac model                                            \n0.2  OLS02    0.6391 -4.630209e+25  0.148360  0.148360\n0.4  OLS04    0.6137 -9.496102e+22  0.149996  0.149996\n0.6  OLS06    0.6086 -2.872000e-01  0.151353  0.151353\n0.8  OLS08    0.5986  5.025000e-01  0.149508  
0.149508\n1.0  OLS10    0.5951  5.864000e-01  0.149549  0.149549\n\n\n&gt;&gt;&gt; print(runner.evaluate())\n\n                mae          mape           mse            r2          rmse\nOLS02 -1.004380e+12 -7.646687e+14 -3.774343e+26 -4.630209e+25 -1.942767e+13\nOLS04 -5.120843e+10 -8.663629e+14 -7.740805e+23 -9.496102e+22 -8.798184e+11\nOLS06 -1.559600e+00 -7.836450e+14 -1.049240e+01 -2.872000e-01 -3.239200e+00\nOLS08 -1.482200e+00 -8.382465e+14 -4.055100e+00  5.025000e-01 -2.013700e+00\nOLS10 -1.445900e+00 -8.224099e+14 -3.371700e+00  5.864000e-01 -1.836200e+00\n</code></pre>"}, {"location": "API/training/trainsizingregressor/#attributes", "title": "Attributes", "text": ""}, {"location": "API/training/trainsizingregressor/#data-attributes", "title": "Data attributes", "text": "<p>The data attributes are used to access the dataset and its properties. Updating the dataset will automatically update the response of these attributes accordingly.</p> <p>Attributesdataset: DataFrame | modin.pandas.dataframe.DataFrameComplete data set. train: DataFrame | modin.pandas.dataframe.DataFrameTraining set. test: DataFrame | modin.pandas.dataframe.DataFrameTest set. X: DataFrame | modin.pandas.dataframe.DataFrameFeature set. y: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s). holdout: DataFrame | NoneHoldout set. <p>This data set is untransformed by the pipeline. Read more in the user guide. X_train: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the training set. y_train: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the training set. X_test: DataFrame | modin.pandas.dataframe.DataFrameFeatures of the test set. y_test: Series | modin.pandas.series.Series | DataFrame | modin.pandas.dataframe.DataFrameTarget column(s) of the test set. shape: tuple[int | numpy.integer, int | numpy.integer]Shape of the dataset (n_rows, n_columns). 
columns: IndexName of all the columns. n_columns: int | numpy.integerNumber of columns. features: IndexName of the features. n_features: int | numpy.integerNumber of features. target: str | list[str]Name of the target column(s). </p> <p></p>"}, {"location": "API/training/trainsizingregressor/#utility-attributes", "title": "Utility attributes", "text": "<p>The utility attributes are used to access information about the models in the instance after training.</p> <p>Attributesmodels: str | list[str] | NoneName of the model(s). metric: str | list[str] | NoneName of the metric(s). winners: list[model] | NoneModels ordered by performance. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. winner: model | NoneBest performing model. <p>Performance is measured as the highest score on the model's <code>[main_metric]_bootstrap</code> or <code>[main_metric]_test</code>, checked in that order. Ties are resolved looking at the lowest <code>time_fit</code>. results: pd.DataFrameOverview of the training results. <p>All durations are in seconds. Possible values include:</p> <ul> <li>[metric]_ht: Score obtained by the hyperparameter tuning.</li> <li>time_ht: Duration of the hyperparameter tuning.</li> <li>[metric]_train: Metric score on the train set.</li> <li>[metric]_test: Metric score on the test set.</li> <li>time_fit: Duration of the model fitting on the train set.</li> <li>[metric]_bootstrap: Mean score on the bootstrapped samples.</li> <li>time_bootstrap: Duration of the bootstrapping.</li> <li>time: Total duration of the run. </li> </ul> <p></p>"}, {"location": "API/training/trainsizingregressor/#tracking-attributes", "title": "Tracking attributes", "text": "<p>The tracking attributes are used to customize what elements of the experiment are tracked. 
Read more in the user guide.</p> <p>Attributeslog_ht: boolWhether to track every trial of the hyperparameter tuning. log_plots: boolWhether to save plots as artifacts. log_data: boolWhether to save the train and test sets. log_pipeline: boolWhether to save the model's pipeline. </p> <p></p>"}, {"location": "API/training/trainsizingregressor/#plot-attributes", "title": "Plot attributes", "text": "<p>The plot attributes are used to customize the plot's aesthetics. Read more in the user guide.</p> <p>Attributespalette: str | Sequence[str]Color palette. <p>Specify one of plotly's built-in palettes or create a custom one, e.g., <code>atom.palette = [\"red\", \"green\", \"blue\"]</code>. title_fontsize: int | floatFontsize for the plot's title. label_fontsize: int | floatFontsize for the labels, legend and hover information. tick_fontsize: int | floatFontsize for the ticks along the plot's axes. line_width: int | floatWidth of the line plots. marker_size: int | floatSize of the markers. </p> <p></p>"}, {"location": "API/training/trainsizingregressor/#methods", "title": "Methods", "text": "<p>Next to the plotting methods, the class contains a variety of methods to handle the data, run the training, and manage the pipeline.</p> <p>available_modelsGive an overview of the available predefined models.canvasCreate a figure with multiple plots.clearReset attributes and clear cache from all models.deleteDelete models.evaluateGet all models' scores for the provided metrics.export_pipelineExport the internal pipeline.get_class_weightReturn class weights for a balanced data set.get_paramsGet parameters for this estimator.mergeMerge another instance of the same class into this one.update_layoutUpdate the properties of the plot's layout.update_tracesUpdate the properties of the plot's traces.reset_aestheticsReset the plot aesthetics to their default values.runTrain and evaluate the models.saveSave the instance to a pickle file.set_paramsSet the parameters of this 
estimator.stackingAdd a Stacking model to the pipeline.votingAdd a Voting model to the pipeline.</p> <p></p> <p>method available_models()[source]Give an overview of the available predefined models.</p> <p>Returnspd.DataFrame Information about the available predefined models. Columns include: <ul> <li>acronym: Model's acronym (used to call the model).</li> <li>model: Name of the model's class.</li> <li>estimator: The model's underlying estimator.</li> <li>module: The estimator's module.</li> <li>needs_scaling: Whether the model requires feature scaling.</li> <li>accepts_sparse: Whether the model accepts sparse matrices.</li> <li>native_multilabel: Whether the model has native support   for multilabel tasks.</li> <li>native_multioutput: Whether the model has native support   for multioutput tasks.</li> <li>has_validation: Whether the model has in-training validation.</li> <li>supports_engines: Engines supported by the model. </li> </ul> <p></p> <p>method canvas(rows=1, cols=2, horizontal_spacing=0.05, vertical_spacing=0.07, title=None, legend=\"out\", figsize=None, filename=None, display=True)[source]Create a figure with multiple plots.</p> <p>This <code>@contextmanager</code> allows you to draw many plots in one figure. The default option is to add two plots side by side. See the user guide for an example.</p> <p>Parametersrows: int, default=1 Number of plots in length. <p>cols: int, default=2 Number of plots in width. <p>horizontal_spacing: float, default=0.05 Space between subplot columns in normalized plot coordinates. The spacing is relative to the figure's size. <p>vertical_spacing: float, default=0.07 Space between subplot rows in normalized plot coordinates. The spacing is relative to the figure's size. <p>title: str, dict or None, default=None Title for the plot. <ul> <li>If None, no title is shown.</li> <li>If str, text for the title.</li> <li>If dict, title configuration.</li> </ul> <p>legend: bool, str or dict, default=\"out\" Legend for the plot. 
See the user guide for an extended description of the choices. <ul> <li>If None: No legend is shown.</li> <li>If str: Location where to show the legend.</li> <li>If dict: Legend configuration.</li> </ul> <p>figsize: tuple or None, default=None Figure's size in pixels, format as (x, y). If None, it adapts the size to the number of plots in the canvas. <p>filename: str, Path or None, default=None Save the plot using this name. Use \"auto\" for automatic naming. The type of the file depends on the provided name (.html, .png, .pdf, etc...). If <code>filename</code> has no file type, the plot is saved as html. If None, the plot is not saved. <p>display: bool, default=True Whether to render the plot. <p>Yieldsgo.Figure Plot object. </p> <p></p> <p>method clear()[source]Reset attributes and clear cache from all models.</p> <p>Reset certain model attributes to their initial state, deleting potentially large data arrays. Use this method to free some memory before saving the instance. The affected attributes are:</p> <ul> <li>In-training validation scores</li> <li>Shap values</li> <li>App instance</li> <li>Dashboard instance</li> <li>Calculated holdout data sets</li> </ul> <p></p> <p>method delete(models=None)[source]Delete models.</p> <p>If all models are removed, the metric is reset. Use this method to drop unwanted models from the pipeline or to free some memory before saving. Deleted models are not removed from any active mlflow experiment.</p> <p>Parametersmodels: int, str, Model, segment, sequence or None, default=None Models to delete. If None, all models are deleted. </p> <p></p> <p>method evaluate(metric=None, rows=\"test\", threshold=0.5, sample_weight=None)[source]Get all models' scores for the provided metrics.</p> <p>Parametersmetric: str, func, scorer, sequence or None, default=None Metric to calculate. If None, it returns an overview of the most common metrics per task. 
<p>rows: hashable, segment, sequence or dataframe, default=\"test\" Selection of rows to calculate metric on. <p>threshold: float or sequence, default=0.5 Threshold between 0 and 1 to convert predicted probabilities to class labels. Only used when: <ul> <li>The task is binary or multilabel classification.</li> <li>The model has a <code>predict_proba</code> method.</li> <li>The metric evaluates predicted probabilities.</li> </ul> <p>For multilabel classification tasks, it's possible to provide a sequence of thresholds (one per target column). The same threshold per target column is applied to all models.</p> <p>sample_weight: sequence or None, default=None Sample weights corresponding to y in <code>dataset</code>. <p>Returnspd.DataFrame Scores of the models. </p> <p></p> <p>method export_pipeline(model=None)[source]Export the internal pipeline.</p> <p>This method returns a deepcopy of the branch's pipeline. Optionally, you can add a model as final estimator. The returned pipeline is already fitted on the training set.</p> <p>Parametersmodel: str, Model or None, default=None Model for which to export the pipeline. If the model used automated feature scaling, the Scaler is added to the pipeline. If None, the pipeline in the current branch is exported. <p>ReturnsPipeline Current branch as a sklearn-like Pipeline object. </p> <p></p> <p>method get_class_weight(rows=\"train\")[source]Return class weights for a balanced data set.</p> <p>Statistically, the class weights re-balance the data set so that the sampled data set represents the target population as closely as possible. The returned weights are inversely proportional to the class frequencies in the selected rows.</p> <p>Parametersrows: hashable, segment, sequence or dataframe, default=\"train\" Selection of rows for which to get the weights. <p>Returnsdict Classes with the corresponding weights. A dict of dicts is returned for multioutput tasks. 
</p> <p></p> <p>method get_params(deep=True)[source]Get parameters for this estimator.</p> <p>Parametersdeep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. <p>Returnsparams : dict Parameter names mapped to their values. </p> <p></p> <p>method merge(other, suffix=\"2\")[source]Merge another instance of the same class into this one.</p> <p>Branches, models, metrics and attributes of the other instance are merged into this one. If there are branches and/or models with the same name, they are merged adding the <code>suffix</code> parameter to their name. The errors and missing attributes are extended with those of the other instance. It's only possible to merge two instances if they are initialized with the same dataset and trained with the same metric.</p> <p>Parametersother: Runner Instance with which to merge. Should be of the same class as self. <p>suffix: str, default=\"2\" Branches and models with conflicting names are merged adding <code>suffix</code> to the end of their names. </p> <p></p> <p>method update_layout(**kwargs)[source]Update the properties of the plot's layout.</p> <p>Recursively update the structure of the original layout with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_layout method. </p> <p></p> <p>method update_traces(**kwargs)[source]Update the properties of the plot's traces.</p> <p>Recursively update the structure of the original traces with the values in the arguments.</p> <p>Parameters**kwargs Keyword arguments for the figure's update_traces method. </p> <p></p> <p>method reset_aesthetics()[source]Reset the plot aesthetics to their default values.</p> <p></p> <p>method run(*arrays)[source]Train and evaluate the models.</p> <p>Read more in the user guide.</p> <p>Parameters*arrays: sequence of indexables Training set and test set. 
Allowed formats are: <ul> <li>train, test</li> <li>X_train, X_test, y_train, y_test</li> <li>(X_train, y_train), (X_test, y_test) </li> </ul> <p></p> <p>method save(filename=\"auto\", save_data=True)[source]Save the instance to a pickle file.</p> <p>Parametersfilename: str or Path, default=\"auto\" Filename or pathlib.Path of the file to save. Use \"auto\" for automatic naming. <p>save_data: bool, default=True Whether to save the dataset with the instance. This parameter is ignored if the method is not called from atom. If False, add the data to the load method to reload the instance. </p> <p></p> <p>method set_params(**params)[source]Set the parameters of this estimator.</p> <p>Parameters**params : dict Estimator parameters. <p>Returnsself : estimator instance Estimator instance. </p> <p></p> <p>method stacking(models=None, name=\"Stack\", **kwargs)[source]Add a Stacking model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the stacking estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Stack\" Name of the model. The name is always preceded by the model's acronym: <code>Stack</code>. <p>**kwargs Additional keyword arguments for sklearn's stacking instance. The model's acronyms can be used for the <code>final_estimator</code> parameter. </p> <p></p> <p>method voting(models=None, name=\"Vote\", **kwargs)[source]Add a Voting model to the pipeline.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p> <p>Parametersmodels: segment, sequence or None, default=None Models that feed the voting estimator. The models must have been fitted on the current branch. <p>name: str, default=\"Vote\" Name of the model. 
The name is always preceded by the model's acronym: <code>Vote</code>. <p>**kwargs Additional keyword arguments for sklearn's voting instance. </p> <p></p>"}, {"location": "changelog/v4.x.x/", "title": "Release history", "text": ""}, {"location": "changelog/v4.x.x/#version-4141", "title": "Version 4.14.1", "text": "<ul> <li>Fixed an installation issue with <code>conda</code>.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4140", "title": "Version 4.14.0", "text": "<ul> <li>Refactor of the Cleaner and Vectorizer classes.</li> <li>Refactor of the cross_validate method.</li> <li>The plot_pipeline method now supports drawing multiple pipelines.</li> <li>Renamed the <code>Normalizer</code> class to <code>TextNormalizer</code>.</li> <li>Renamed the <code>Gauss</code> class to <code>Normalizer</code>.</li> <li>Added the <code>inverse_transform</code> method to the Scaler, Normalizer   and Cleaner classes.</li> <li>Added the <code>winners</code> property to the trainers (note the extra <code>s</code>). </li> <li>Added the <code>feature_names_in_</code> and <code>n_features_in_</code> attributes to transformers.</li> <li>The default value of the <code>warnings</code> parameter is set to False.</li> <li>Improvements for multicollinearity removal in FeatureSelector.</li> <li>Renamed default feature names to <code>x0</code>, <code>x1</code>, etc... for consistency with   sklearn's API.</li> <li>Renamed component names in FeatureSelector   to <code>pca0</code>, <code>pca1</code>, etc... for consistency with sklearn's API.</li> <li>Significant speed up in pipeline transformations.</li> <li>Fixed a bug where mlflow runs could be ended unexpectedly.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4131", "title": "Version 4.13.1", "text": "<ul> <li>Fixed an installation issue.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4130", "title": "Version 4.13.0", "text": "<ul> <li>Added GPU support. 
Read more in the user guide.</li> <li>Added advanced feature selection strategies.</li> <li>Added the <code>return_sparse</code> parameter to the Vectorizer class.</li> <li>Added the <code>quantile</code> hyperparameter to the Dummy model.</li> <li>The data attributes now return pandas objects where possible.</li> <li>Fixed a bug where the BO could crash after balancing   the data.</li> <li>Fixed a bug where saving the FeatureGenerator   class could fail for certain operators.</li> <li>Fixed a bug where the FeatureSelector   class displayed the wrong output.</li> <li>Fixed a bug where the <code>mapping</code> attribute was not reordered.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4120", "title": "Version 4.12.0", "text": "<ul> <li>Support for Python 3.10.</li> <li>New Discretizer class to bin numerical features.</li> <li>Refactor of the FeatureGenerator class.</li> <li>The <code>mapping</code> attribute now shows all encoded features.</li> <li>Added the <code>sample_weight</code> parameter to the evaluate method.</li> <li>ATOMClassifier has now a <code>stratify</code> parameter   to split the data sets in a stratified fashion.</li> <li>Possibility to exclude hyperparameters from the BO adding <code>!</code> before the name.</li> <li>Added memory usage to the stats method.</li> <li>Fixed a bug where plot_shap_decision could fail when only one row was plotted.</li> <li>Added versioning to the documentation.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4110", "title": "Version 4.11.0", "text": "<ul> <li>Full support for sparse matrices. 
Read more in the user guide.</li> <li>The shrink method now also handles sparse features.</li> <li>Refactor of the distribution method.</li> <li>Added three new linear models: Lars, Huber and Perc.</li> <li>Dimensions can be shared across models using the key 'all' in <code>ht_params[\"dimensions\"]</code>.</li> <li>Assign hyperparameters to tune using the predefined dimensions.</li> <li>It's now possible to tune a custom number of layers for the MLP   model.</li> <li>If multiple BO calls share the best score, the one with the shortest   training time is selected as winner (instead of the first).</li> <li>Fixed a bug where the BO could fail when custom dimensions were defined.</li> <li>Fixed a bug where FeatureSelector   could fail after repeated calls to fit.</li> <li>Fixed a bug where FeatureGenerator   didn't pass the correct data indices to its output.</li> <li>Performance improvements for the custom pipeline.</li> <li>Minor documentation fixes.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-4100", "title": "Version 4.10.0", "text": "<ul> <li>Added the <code>holdout</code> data set to have an extra way of assessing a   model's performance on a completely independent dataset. Read more   in the user guide.</li> <li>Complete rework of the ensemble models.</li> <li>Support for dataframe indexing. 
Read more in the user guide.</li> <li>New plot_parshap plot to detect overfitting   features.</li> <li>The new create_dashboard method makes analyzing   the models even easier using a dashboard app.</li> <li>The plot_feature_importance   plot now also accepts estimators with coefficients.</li> <li>Added the transform method for models.</li> <li>Added the <code>threshold</code> parameter to the evaluate method.</li> <li>The <code>reset_predictions</code> method is deprecated in favour of the new   clear method.</li> <li>Refactor of the model's full_train method.</li> <li>The merge method is available for all trainers.</li> <li>Improvements in the trainer's pipeline.</li> <li>Training scores are now also saved to the mlflow run.</li> <li>Trying to change the data in a branch after fitting a model with it now   raises an exception.</li> <li>Fixed a bug where the columns of array inputs were not ordered correctly.</li> <li>Fixed a bug where branches did not correctly act case-insensitive.</li> <li>Fixed a bug where the export_pipeline   method for models would not export the transformers in the correct branch.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-491", "title": "Version 4.9.1", "text": "<ul> <li>Changed the default cross-validation for hyperparameter tuning   from 5 to 1 to avoid errors with deep learning models.</li> <li>Added clearer exception messages when a model's run failed.</li> <li>Fixed a bug where custom dimensions didn't show during   hyperparameter tuning.</li> <li>Documentation improvements.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-490", "title": "Version 4.9.0", "text": "<ul> <li>Drop support of Python 3.6.</li> <li>Added the HistGBM model.</li> <li>Improved print layout for hyperparameter tuning.</li> <li>The new available_models method returns an overview of   the available predefined models.</li> <li>The calibrate and cross_validate   methods can no longer be accessed from the trainers.</li> <li>The <code>pipeline</code> 
parameter for the prediction methods is deprecated.</li> <li>Improved visualization of the plot_rfecv, plot_successive_halving and    plot_learning_curve methods.</li> <li>Sparse matrices are now accepted as input.</li> <li>Duplicate BO calls are no longer calculated.</li> <li>Improvement in performance of the RNN model.</li> <li>Refactor of the model's <code>bo</code> attribute.</li> <li>Predefined hyperparameters have been updated to be consistent with sklearn's API.</li> <li>Fixed a bug where custom scalers were ignored by the models.</li> <li>Fixed a bug where the BO of certain models would crash with custom hyperparameters.</li> <li>Fixed a bug where duplicate column names could be generated from a custom transformer.</li> <li>Documentation improvements.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-480", "title": "Version 4.8.0", "text": "<ul> <li>The Encoder class now directly handles   unknown categories encountered during fitting.</li> <li>The Balancer and Encoder classes now accept custom estimators for   the <code>strategy</code> parameter.</li> <li>The new merge method enables the user to merge   multiple atom instances into one.</li> <li>The dtype shrinking is moved from atom's initializers to the   shrink method.</li> <li>ATOM's custom pipeline now handles transformers fitted on a   subset of the dataset.</li> <li>The <code>column</code> parameter in the distribution   method is renamed to <code>columns</code> for continuity of the API.</li> <li>The <code>mae</code> criterion for the GBM model hyperparameter tuning is deprecated   to be consistent with sklearn's API.</li> <li>Branches are now case-insensitive.</li> <li>Renaming a branch using an existing name now raises an exception.</li> <li>Fixed a bug where columns of type <code>category</code> broke the Imputer class.</li> <li>Fixed a bug where predictions of the Stacking ensemble crashed for   branches with multiple transformers.</li> <li>The tables in the documentation now adapt to dark 
mode.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-473", "title": "Version 4.7.3", "text": "<ul> <li>Fixed a bug where the conda-forge recipe couldn't install properly.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-472", "title": "Version 4.7.2", "text": "<ul> <li>Fixed a bug where the pipeline failed for custom transformers that   returned sparse matrices.</li> <li>Package requirements files are added to the installer.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-471", "title": "Version 4.7.1", "text": "<ul> <li>Fixed a bug where the pip installer failed.</li> <li>Fixed a bug where categorical columns also selected datetime columns.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-470", "title": "Version 4.7.0", "text": "<ul> <li>Launched our new slack channel!</li> <li>The new FeatureExtractor class extracts useful features from datetime columns.</li> <li>The new plot_det method plots a binary classifier's detection error tradeoff curve. </li> <li>The plot_partial_dependence is able to draw Individual Conditional Expectation (ICE) lines.</li> <li>The full traceback of exceptions encountered during training is now   saved to the logger.</li> <li>ATOMClassifier and ATOMRegressor now convert the dtypes of the input   data to the minimal allowed type for memory efficiency.</li> <li>The scoring method is renamed to evaluate to clarify its purpose.</li> <li>The <code>column</code> parameter in the apply method   is renamed to <code>columns</code> for continuity of the API.</li> <li>Minor documentation improvements.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-460", "title": "Version 4.6.0", "text": "<ul> <li>Added the full_train method to retrieve an estimator trained   on the complete dataset.</li> <li>The score method is now also able to calculate custom metrics on new data.</li> <li>Refactor of the Imputer class. 
</li> <li>Refactor of the Encoder class to avoid errors for unknown classes and allow   the input of missing values.</li> <li>The clean method no longer automatically   encodes the target column for regression tasks.</li> <li>Creating a branch using a model's acronym as name now raises an exception.</li> <li>Fixed a bug where CatBoost failed when <code>early_stopping</code> &lt; 1.</li> <li>Fixed a bug where created pipelines had duplicated names.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-450", "title": "Version 4.5.0", "text": "<ul> <li>Support of NLP pipelines. Read more in the user guide.</li> <li>Integration of mlflow to track all models in the   pipeline. Read more in the user guide.</li> <li>The new Normalizer class transforms features to a more Gaussian-like distribution.</li> <li>New cross_validate method to evaluate the robustness   of a pipeline using cross_validation.</li> <li>New reset method to go back to atom's initial state.</li> <li>Added the Dummy model to compare other models with a simple baseline.</li> <li>New plot_wordcloud and plot_ngrams methods for text visualization.</li> <li>Plots now can return the figure object when <code>display=None</code>.</li> <li>The Pruner class is now able to drop outliers   based on the selection of multiple strategies.</li> <li>The new <code>shuffle</code> parameter in atom's initializer determines whether to   shuffle the dataset.</li> <li>The trainers no longer require you to specify a model using the <code>models</code>   parameter. 
If left to default, all predefined models for that task are used.</li> <li>The apply method now accepts args and kwargs for the function.</li> <li>Refactor of the evaluate method.</li> <li>Refactor of the export_pipeline method.</li> <li>The parameters in the Cleaner class have been refactored to better describe   their function.</li> <li>The <code>train_sizes</code> parameter in train_sizing now accepts integer   values to automatically create equally distributed splits in the training set.</li> <li>Refactor of plot_pipeline to show models in the diagram as well.</li> <li>Refactor of the <code>bagging</code> parameter to the (more appropriate) name <code>n_bootstrap</code>.</li> <li>New option to exclude columns from a transformer adding <code>!</code> before their name.</li> <li>Fixed a bug where the Pruner class failed if there were categorical columns   in the dataset.</li> <li>Completely reworked documentation website.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-440", "title": "Version 4.4.0", "text": "<ul> <li>New apply method to perform data transformations   as a function in the pipeline.</li> <li>Added the status method to save an overview of   atom's branches and models to the logger.</li> <li>Improved the output messages for the Imputer class.</li> <li>The dataset's columns can now be called directly from atom.</li> <li>The distribution and plot_distribution   methods now ignore missing values.</li> <li>Fixed a bug where transformations could fail when columns were added to the   dataset after initializing the pipeline.</li> <li>Fixed a bug where the Cleaner class didn't drop   columns consisting entirely of missing values when <code>drop_min_cardinality=True</code>.</li> <li>Fixed a bug where the winning model wasn't displayed correctly.</li> <li>Refactored the way transformers are added or removed from predicting methods.</li> <li>Improved documentation.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-430", "title": "Version 4.3.0", 
"text": "<ul> <li>Possibility to add custom transformers to the pipeline.</li> <li>The export_pipeline utility method exports   atom's current pipeline to a sklearn object.</li> <li>New magic methods make atom behave similarly to sklearn's Pipeline.</li> <li>All training approaches can now be combined in the same atom instance.</li> <li>New plot_relationships, plot_distribution and plot_qq plots for data inspection.</li> <li>Complete rework of all the shap plots to be consistent with their new API.</li> <li>Improvements for the Scaler and Pruner classes.</li> <li>The acronym for custom models now defaults to the capital letters in the class' __name__.</li> <li>Possibility to apply transformations on only a subset of the columns.</li> <li>Plots and methods now accept <code>winner</code> as model name.</li> <li>Fixed a bug where custom metrics didn't show the correct name.</li> <li>Fixed a bug where timers were not displayed correctly.</li> <li>Further compatibility with deep learning datasets.</li> <li>Large refactoring for performance optimization.</li> <li>Cleaner output of messages to the logger.</li> <li>Plots no longer show a default title.</li> <li>Minor bug fixes.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-421", "title": "Version 4.2.1", "text": "<ul> <li>Bug fix where there was memory leakage in successive halving   and train sizing pipelines.</li> <li>Improved documentation.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-420", "title": "Version 4.2.0", "text": "<ul> <li>Possibility to add custom models to the pipeline using ATOMModel.</li> <li>Compatibility with deep learning models.</li> <li>New branch system for different data pipelines. 
Read more in the user guide.</li> <li>Use the canvas contextmanager to draw multiple plots in one figure.</li> <li>New voting and stacking ensemble techniques.</li> <li>New get_class_weight utility method.</li> <li>New Sequential Feature Selection strategy for the FeatureSelector.</li> <li>Added the <code>sample_weight</code> parameter to the score method.</li> <li>New ways to initialize the data in the <code>training</code> instances.</li> <li>The <code>test_size</code> parameter now also allows integer values.</li> <li>Renamed categories to classes to be consistent with sklearn's API.</li> <li>The class property now returns a pd.DataFrame of the number of rows per target class   in the train, test and complete dataset.</li> <li>Possibility to add custom parameters to an estimator's fit method through <code>est_params</code>.</li> <li>The successive halving and train sizing approaches now both allow subsequent   runs from atom without losing the information from previous runs.</li> <li>Bug fix where ATOMLoader wouldn't encode the target column during transformation.</li> <li>Added the Deep learning, Ensembles   and Utilities example notebooks.</li> <li>Support for python 3.9.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-410", "title": "Version 4.1.0", "text": "<ul> <li>New <code>est_params</code> parameter to customize the parameters in every model's estimator.</li> <li>Following skopt's API, the <code>n_random_starts</code> parameter to specify the number   of random trials is deprecated in favour of <code>n_initial_points</code>.</li> <li>The Balancer class now allows you to use any of the   strategies from imblearn.</li> <li>New utility attributes to inspect the dataset.</li> <li>Four new models: CatNB, CNB, ARD and RNN.</li> <li>Added the models section to the documentation.</li> <li>Small changes in log outputs.</li> <li>Bug fixes and performance improvements.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-401", "title": "Version 4.0.1", 
"text": "<ul> <li>Bug fix where the FeatureGenerator was not deterministic for a fixed random state.</li> <li>Bug fix where subsequent runs with the same metric failed.</li> <li>Added the license file to the package's installer.</li> <li>Typo fixes in documentation.</li> </ul>"}, {"location": "changelog/v4.x.x/#version-400", "title": "Version 4.0.0", "text": "<ul> <li>Bayesian optimization package changed from GpyOpt   to skopt.</li> <li>Complete revision of the model's hyperparameters.</li> <li>Four SHAP plots can now be called directly from an ATOM pipeline.</li> <li>Two new plots for regression tasks.</li> <li>New plot_pipeline and <code>pipeline</code> attribute to access all transformers. </li> <li>Possibility to determine transformer parameters per method.</li> <li>New calibrate and plot_calibration methods.</li> <li>Metrics can now be added as scorers or functions with signature metric(y, y_pred, **kwargs).</li> <li>Implementation of multi-metric runs.</li> <li>Possibility to choose which metric to plot.</li> <li>Early stopping for models that allow in-training validation.</li> <li>Added the <code>ATOMLoader</code> function to load any saved pickle instance.</li> <li>The \"remove\" strategy in the data cleaning parameters is deprecated in favour of \"drop\".</li> <li>Implemented the dfs strategy in FeatureGenerator.</li> <li>All training classes now inherit from BaseEstimator.</li> <li>Added multiple new example notebooks.</li> <li>Tests coverage up to 100%.</li> <li>Completely new documentation page.</li> <li>Bug fixes and performance improvements.</li> </ul>"}, {"location": "changelog/v5.x.x/", "title": "Release history", "text": ""}, {"location": "changelog/v5.x.x/#version-600", "title": "Version 6.0.0", "text": "<p> New features</p> <ul> <li>Completely new module for time series. Read more in the user guide.</li> <li>Support for Python 3.11 and drop support for Python 3.8   and Python 3.9.</li> <li>New data engines. 
Read more in the user guide.</li> <li>Improved memory optimizations. Read more in the user guide.</li> <li>Added the <code>iterative</code> strategy for numerical imputation.</li> <li>New update_traces method to further customize your plots.</li> </ul> <p> API changes</p> <ul> <li>The FeatureGrouper class no longer accepts a <code>name</code> parameter. Provide   the group names directly through the <code>group</code> parameter as dict.</li> <li>Rework of the register method.</li> <li>The <code>multioutput</code> attribute is deprecated. Multioutput meta-estimators are   now assigned automatically.</li> <li>Model tags have to be separated from the acronym by an underscore.</li> <li>The <code>engine</code> parameter is now a dict.</li> <li>The <code>automl</code> method is deprecated.</li> </ul> <p> Enhancements</p> <ul> <li>Transformations only on <code>y</code> are now accepted, e.g., <code>atom.scale(columns=-1)</code>.</li> <li>Full support for pandas nullable dtypes.</li> <li>The dataset can now be provided as callable.</li> <li>The save and save_data   methods now accept pathlib.Path objects as <code>filename</code>.</li> <li>Cleaner representation on hover for the plot_timeline method.</li> <li>Added the <code>hdbscan</code> strategy to the Pruner class.</li> <li>The <code>cv</code> key in <code>ht_params</code> now accepts a custom cross-validation generator.</li> <li>Improved error message for incorrect stratification of multioutput datasets.</li> <li>Rework of the shrink method.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed a bug where the cross_validate method could   fail for pipelines that changed the number of rows.</li> <li>Fixed a bug where the Pruner class didn't drop all outlier clusters.</li> <li>Fixed a bug where the pipeline could fail for transformers that returned a   series.</li> <li>Fixed a bug where the pipeline could fail for transformers that reset its   internal attributes during fitting.</li> <li>Fixed a bug where the register method 
failed in Databricks.</li> <li>Fixed a bug where tuning hyperparameter for a <code>base_estimator</code> inside a custom   meta-estimator would fail.</li> <li>Fixed a bug where the data properties' <code>@setter</code> could fail for numpy arrays.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-520", "title": "Version 5.2.0", "text": "<p> New features</p> <ul> <li>Two new plot methods: plot_terminator_improvement and plot_timeline.</li> </ul> <p> Enhancements</p> <ul> <li>Data splits in every trial are now properly stratified according to the   selected strategy.</li> <li>Performance optimization for multiple methods using smart caching.</li> <li>Improved visualizations for plots with logarithmic hyperparameters.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed a bug where parameters in a trial would not match with those displayed.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-512", "title": "Version 5.1.2", "text": "<p> API changes</p> <ul> <li>The default <code>strategy</code> for the <code>encode</code> method has   changed from \"LeaveOneOut\" to \"Target\"-encoding. 
LeaveOneOut is no longer a   supported strategy.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed a bug where stratification failed for datasets where the target column was   not placed last.</li> <li>Fixed a bug where transformers with no <code>get_feature_names_out</code> method could fail.</li> <li>Fixed a bug where the FeatureSelector class could fail when transforming a   dataset with different column order than seen at fit time.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-511", "title": "Version 5.1.1", "text": "<p> API changes</p> <ul> <li>The <code>infrequent_to_value</code> parameter in the Encoder class is replaced with   <code>infrequent_to_value</code> to be consistent with sklearn's naming convention.</li> </ul> <p> Enhancements</p> <ul> <li>Added the <code>kwargs</code> parameter to the save_data method.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed an installation issue for systems without an x86 architecture.</li> <li>Fixed a bug where Voting would fail for certain metrics.</li> <li>Fixed a bug where the time metric in mlflow was always zero.</li> <li>Fixed a bug where shap plots wouldn't display the full column names.</li> <li>Fixed a bug where column names were not properly propagated during   transformation.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-510", "title": "Version 5.1.0", "text": "<p> New features</p> <ul> <li>Support for multilabel classification, multiclass-multilabel classification   and multioutput regression tasks. Read more in the user guide.</li> <li>New backend parameter to choose a parallel execution   backend.</li> <li>New <code>parallel</code> parameter to train multiple models   simultaneously.</li> <li>Integration with DAGsHub to store your mlflow experiments.   
Read more in the user guide.</li> <li>New serve method to deploy models to a rest API endpoint.</li> <li>New get_best_threshold method to calculate the   optimal threshold for binary and multilabel tasks.</li> <li>New get_sample_weight method to calculate   the sample weights for a balanced data set.</li> </ul> <p> API changes</p> <ul> <li>The <code>ATOMLoader</code> class is deprecated in favor of the load method.</li> <li>The <code>errors</code> attribute for runners is deprecated.</li> </ul> <p> Enhancements</p> <ul> <li>Added three new notebook examples.</li> <li>Added the <code>drop_chars</code> parameter to the Cleaner class.</li> <li>Added the <code>errors</code> parameter to the trainers.</li> <li>Rework of the dependencies, making the base package more lightweight.</li> <li>The logging entries for external libraries are redirected to atom's   file handler.</li> </ul> <p> Bug fixes</p> <ul> <li>Fixed multiple errors that appeared after sklearn's 1.2 update.</li> <li>Fixed a bug where hyperparameter tuning could fail for multi-metric runs.</li> <li>Fixed a bug where trials would try to report multiple times the same step.</li> <li>Fixed a bug where custom models could skip in-training validation.</li> <li>Fixed an issue where the bootstrapping estimators were trained using   <code>partial_fit</code>.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-501", "title": "Version 5.0.1", "text": "<p> Bug fixes</p> <ul> <li>Fixed installation issue.</li> <li>Updated package dependencies.</li> </ul> <p></p>"}, {"location": "changelog/v5.x.x/#version-500", "title": "Version 5.0.0", "text": "<p> New features</p> <ul> <li>Completely new hyperparameter tuning process.</li> <li>Completely reworked plotting interface.</li> <li>Accelerate your pipelines with sklearnex.</li> <li>New FeatureGrouper class to extract statistical features from   similar groups.</li> <li>New create_app method to create a nice front-end   for model predictions.</li> <li>New 
inverse_transform method for   atom and models.</li> <li>New linear model: OrthogonalMatchingPursuit.</li> <li>The plot_results method now accepts time metrics.</li> </ul> <p> API changes</p> <ul> <li>The <code>gpu</code> parameter is deprecated in favor of <code>device</code>   and <code>engine</code>.</li> <li>Refactor of the Cleaner, Discretizer, Encoder and FeatureSelector   classes.</li> <li>Refactor of all shap plots.</li> <li>Refactor of the apply method.</li> <li>The <code>plot_scatter_matrix</code> method is renamed to plot_relationships.</li> <li>The <code>kSVM</code> model is renamed to SVM.</li> <li>Multidimensional datasets are no longer supported. Check the deep learning   section of the user guide for guidance with such datasets.</li> <li>The <code>greater_is_better</code>, <code>needs_proba</code> and <code>needs_threshold</code> parameters are   deprecated. Metric functions are now created using make_scorer's   default parameters.</li> <li>The <code>drop</code> method is removed from atom. 
Use the reworked apply   method instead.</li> <li>The prediction methods can no longer be called from atom.</li> <li>The dashboard method for models is now called create_dashboard.</li> </ul> <p> Enhancements</p> <ul> <li>New examples for plotting, automated feature scaling,   pruning and advanced hyperparameter tuning.</li> <li>The Normalizer class can now be accelerated with GPU.</li> <li>The Scaler class now ignores binary columns (only 0s and 1s).</li> <li>The <code>models</code> parameter in plot and utility methods now accepts model indices.</li> <li>The transform method now also transforms   only <code>y</code> when <code>X</code> has a default value.</li> <li>The prediction methods now return pandas objects.</li> <li>Dependency versions are checked with originals after unpickling.</li> <li>Automatic generation of documentation from docstrings.</li> <li>Improvements in documentation display for mobile phones.</li> <li>New <code>feature_importance</code> attribute for models.</li> <li>Added a visualization for automated feature scaling to plot_pipeline.</li> </ul> <p> Bug fixes</p> <ul> <li>The FeatureExtractor class no longer raises a warning for highly   fragmented dataframes.</li> <li>Fixed a bug where models could not call the score function.</li> <li>The Encoder class no longer fails when the user provides ordinal   values that are not present during fitting.</li> <li>Fixed a bug with the <code>max_nan_rows</code> parameter in the Imputer class.</li> <li>Fixed a bug where Tokenizer could fail when no ngrams were found.</li> </ul>"}, {"location": "examples/accelerating_cuml/", "title": "Accelerating cuml", "text": "In\u00a0[1]: Copied! 
<pre>from atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n\n# Create a dummy dataset\nX, y = make_classification(n_samples=100000, n_features=40)\n</pre> from atom import ATOMClassifier from sklearn.datasets import make_classification  # Create a dummy dataset X, y = make_classification(n_samples=100000, n_features=40) In\u00a0[2]: Copied! <pre>atom = ATOMClassifier(X, y, device=\"gpu\", engine=\"cuml\", verbose=2)\n</pre> atom = ATOMClassifier(X, y, device=\"gpu\", engine=\"cuml\", verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\nAlgorithm task: binary classification.\nGPU training enabled.\nBackend engine: cuml.\n\nDataset stats ==================== &gt;&gt;\nShape: (100000, 41)\nMemory: 32.80 MB\nScaled: True\nOutlier values: 8127 (0.2%)\n-------------------------------------\nTrain set size: 80000\nTest set size: 20000\n-------------------------------------\n|   |       dataset |         train |          test |\n| - | ------------- | ------------- | ------------- |\n| 0 |   50006 (1.0) |   40005 (1.0) |   10001 (1.0) |\n| 1 |   49994 (1.0) |   39995 (1.0) |    9999 (1.0) |\n\n</pre> In\u00a0[3]: Copied! <pre>atom.scale()\n</pre> atom.scale() <pre>Fitting Scaler...\nScaling features...\n</pre> In\u00a0[13]: Copied! <pre>atom.dataset\n</pre> atom.dataset Out[13]: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x31 x32 x33 x34 x35 x36 x37 x38 x39 target 0 2.021646 -0.634557 -0.867811 1.103642 1.559011 0.122284 -0.864821 1.411657 0.147997 -2.269082 ... -0.489864 1.861048 -0.353861 0.720823 -1.522117 -0.737707 -1.573936 -0.832174 0.203154 0 1 -0.019885 0.846568 -0.364059 -1.091604 -1.336692 0.186689 -0.274142 0.020563 0.693235 -1.908658 ... -1.610058 -0.365231 0.284908 0.170156 -0.236553 -0.573761 -0.107317 -2.480178 0.420341 0 2 0.516618 -0.013420 -0.753879 -0.488243 0.560051 0.395817 -0.522523 -1.083503 -0.073398 0.383061 ... 
0.966283 1.405546 -0.658654 0.339090 -1.615997 -1.312444 0.984578 0.602858 -1.110684 1 3 0.111861 -0.966334 0.208509 0.494328 -0.766835 -0.003399 -0.500449 -0.530622 -0.481663 -1.146132 ... -0.304896 2.030211 -1.189488 -1.238600 1.658765 -0.255644 0.572194 0.195496 0.617734 1 4 0.160135 -0.873517 0.719142 -2.020767 0.421435 -1.941230 0.835615 -1.178845 0.235273 -0.328574 ... 1.633662 -0.631118 1.814046 1.031754 0.328665 1.704483 2.153710 -1.430552 -0.543915 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 99995 1.100240 0.092581 -0.346265 0.234024 0.590199 0.755019 -1.688456 -1.031070 -0.620193 -0.283336 ... 0.356480 1.346821 -0.299087 2.343587 -2.003646 -0.933179 0.764255 -0.233526 -1.462311 1 99996 -1.142596 0.321843 -0.974006 0.390418 0.404722 -0.324256 -0.288176 1.009458 0.860912 -0.191313 ... 0.044618 -2.030135 1.448640 -0.854798 1.441451 1.347461 -0.937607 0.572504 -0.787673 0 99997 1.658252 0.303637 -0.020324 0.225917 0.154092 -1.208507 -0.199919 1.063016 -0.395696 -0.060886 ... 1.563345 -1.261853 -0.810122 -0.503823 1.565602 -1.264792 -0.591644 1.588397 0.601721 0 99998 -0.288042 -1.139792 1.548338 0.501413 0.361604 -0.315720 -0.564607 1.500870 0.501768 0.649079 ... 0.344663 1.734476 0.660177 0.767554 1.461940 0.310189 -1.469978 0.900132 1.114330 0 99999 -3.093351 -0.636463 -0.449575 1.169980 -1.041870 -0.257173 2.072777 -0.101111 -0.956916 -0.251162 ... 2.250647 0.746250 -0.610311 0.445467 -0.636288 -0.187444 0.226108 -0.186927 -1.024960 1 <p>100000 rows \u00d7 41 columns</p> In\u00a0[4]: Copied! <pre>print(f\"Scaler used: {atom.standard}\")\nprint(f\"Scaler's module: {atom.standard.__class__.__module__}\")\n</pre> print(f\"Scaler used: {atom.standard}\") print(f\"Scaler's module: {atom.standard.__class__.__module__}\") <pre>Scaler used: StandardScaler()\nScaler's module: cuml._thirdparty.sklearn.preprocessing._data\n</pre> In\u00a0[5]: Copied! 
<pre>atom.run(models=[\"RF\", \"SGD\", \"XGB\"])\n</pre> atom.run(models=[\"RF\", \"SGD\", \"XGB\"]) <pre>\nTraining ========================= &gt;&gt;\nModels: RF, SGD, XGB\nMetric: f1\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9726\nTest evaluation --&gt; f1: 0.9431\nTime elapsed: 1.935s\n-------------------------------------------------\nTotal time: 1.935s\n\n\nResults for StochasticGradientDescent:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9236\nTest evaluation --&gt; f1: 0.9219\nTime elapsed: 02m:16s\n-------------------------------------------------\nTotal time: 02m:16s\n\n\nResults for XGBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9749\nTest evaluation --&gt; f1: 0.9437\nTime elapsed: 6.394s\n-------------------------------------------------\nTotal time: 6.394s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 02m:24s\n-------------------------------------\nRandomForest              --&gt; f1: 0.9431\nStochasticGradientDescent --&gt; f1: 0.9219\nXGBoost                   --&gt; f1: 0.9437 !\n</pre> In\u00a0[6]: Copied! <pre>atom.results\n</pre> atom.results Out[6]: score_train score_test time_fit time RF 0.9726 0.9431 1.934512 1.934512 SGD 0.9236 0.9219 135.871493 135.871493 XGB 0.9749 0.9437 6.394416 6.394416 In\u00a0[7]: Copied! <pre>for m in atom.models:\n    print(f\"{m}'s module: {atom[m].estimator.__class__.__module__}\")\n</pre> for m in atom.models:     print(f\"{m}'s module: {atom[m].estimator.__class__.__module__}\") <pre>RF's module: cuml.ensemble.randomforestclassifier\nSGD's module: sklearn.linear_model._stochastic_gradient\nXGB's module: xgboost.sklearn\n</pre> In\u00a0[8]: Copied! 
<pre>atom.evaluate()\n</pre> atom.evaluate() Out[8]: accuracy average_precision balanced_accuracy f1 jaccard matthews_corrcoef precision recall roc_auc RF 0.9429 0.9741 0.9429 0.9431 0.8924 0.8858 0.9391 0.9472 0.9792 SGD 0.9217 0.9635 0.9218 0.9219 0.8551 0.8435 0.9203 0.9235 0.9676 XGB 0.9434 0.9753 0.9434 0.9437 0.8933 0.8868 0.9385 0.9489 0.9798"}, {"location": "examples/accelerating_cuml/#example-accelerating-pipelines-on-gpu", "title": "Example: Accelerating pipelines on GPU\u00b6", "text": "<p>This example shows how to accelerate a pipeline on GPU using cuML.</p> <p>The data used is a synthetic dataset created using sklearn's make_classification function.</p>"}, {"location": "examples/accelerating_sklearnex/", "title": "Accelerating sklearnex", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! 
<pre>atom = ATOMClassifier(X, \"RainTomorrow\", verbose=2)\n</pre> atom = ATOMClassifier(X, \"RainTomorrow\", verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\n</pre> In\u00a0[4]: Copied! <pre># Impute missing values and encode categorical columns\natom.impute()\natom.encode()\n</pre> # Impute missing values and encode categorical columns atom.impute() atom.encode() <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Dropping 637 samples due to missing values in feature MinTemp.\n --&gt; Dropping 322 samples due to missing values in feature MaxTemp.\n --&gt; Dropping 1406 samples due to missing values in feature Rainfall.\n --&gt; Dropping 60843 samples due to missing values in feature Evaporation.\n --&gt; Dropping 67816 samples due to missing values in feature Sunshine.\n --&gt; Dropping 9330 samples due to missing values in feature WindGustDir.\n --&gt; Dropping 9270 samples due to missing values in feature WindGustSpeed.\n --&gt; Dropping 10013 samples due to missing values in feature WindDir9am.\n --&gt; Dropping 3778 samples due to missing values in feature WindDir3pm.\n --&gt; Dropping 1348 samples due to missing values in feature WindSpeed9am.\n --&gt; Dropping 2630 samples due to missing values in feature WindSpeed3pm.\n --&gt; Dropping 1774 samples due to missing values in feature Humidity9am.\n --&gt; Dropping 3610 samples due to missing values in feature Humidity3pm.\n --&gt; Dropping 14014 samples due to missing values in feature Pressure9am.\n --&gt; Dropping 13981 samples due to missing values in feature Pressure3pm.\n --&gt; Dropping 53657 samples due to 
missing values in feature Cloud9am.\n --&gt; Dropping 57094 samples due to missing values in feature Cloud3pm.\n --&gt; Dropping 904 samples due to missing values in feature Temp9am.\n --&gt; Dropping 2726 samples due to missing values in feature Temp3pm.\n --&gt; Dropping 1406 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 26 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[5]: Copied! <pre># Train a K-Nearest Neighbors model (using default sklearn)\natom.run(models=\"KNN\", metric=\"f1\")\n</pre> # Train a K-Nearest Neighbors model (using default sklearn) atom.run(models=\"KNN\", metric=\"f1\") <pre>\nTraining ========================= &gt;&gt;\nModels: KNN\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.7135\nTest evaluation --&gt; f1: 0.5904\nTime elapsed: 4.239s\n-------------------------------------------------\nTime: 4.239s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 8.264s\n-------------------------------------\nKNearestNeighbors --&gt; f1: 0.5904\n</pre> In\u00a0[7]: Copied! <pre># Now, we train an accelerated KNN using engine=\"sklearnex\"\n# Note the difference in training speed!!\natom.run(models=\"KNN_acc\", metric=\"f1\", engine={\"estimator\": \"sklearnex\"})\n</pre> # Now, we train an accelerated KNN using engine=\"sklearnex\" # Note the difference in training speed!! 
atom.run(models=\"KNN_acc\", metric=\"f1\", engine={\"estimator\": \"sklearnex\"}) <pre>\nTraining ========================= &gt;&gt;\nModels: KNN_acc\nMetric: f1\n\n\nResults for KNearestNeighbors:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.7135\nTest evaluation --&gt; f1: 0.5904\nTime elapsed: 1.185s\n-------------------------------------------------\nTime: 1.185s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.226s\n-------------------------------------\nKNearestNeighbors --&gt; f1: 0.5904\n</pre> In\u00a0[8]: Copied! <pre>atom.results\n</pre> atom.results Out[8]: f1_train f1_test time_fit time KNN 0.7135 0.5904 4.238729 4.238729 KNN_acc 0.7135 0.5904 1.184578 1.184578 In\u00a0[9]: Copied! <pre># Note how the underlying estimators might look the same...\nprint(atom.knn.estimator)\nprint(atom.knn_acc.estimator)\n\n# ... but are using different implementations\nprint(atom.knn.estimator.__module__)\nprint(atom.knn_acc.estimator.__module__)\n</pre> # Note how the underlying estimators might look the same... print(atom.knn.estimator) print(atom.knn_acc.estimator)  # ... but are using different implementations print(atom.knn.estimator.__module__) print(atom.knn_acc.estimator.__module__) <pre>KNeighborsClassifier(n_jobs=1)\nKNeighborsClassifier(n_jobs=1)\nsklearn.neighbors._classification\nsklearnex.neighbors.knn_classification\n</pre> In\u00a0[10]: Copied! 
<pre>with atom.canvas(1, 2, title=\"Timing engines: sklearn vs sklearnex\"):\n    atom.plot_results(metric=\"time_fit\", title=\"Training\")\n    atom.plot_results(metric=\"time\", title=\"Total\")\n</pre> with atom.canvas(1, 2, title=\"Timing engines: sklearn vs sklearnex\"):     atom.plot_results(metric=\"time_fit\", title=\"Training\")     atom.plot_results(metric=\"time\", title=\"Total\")"}, {"location": "examples/accelerating_sklearnex/#example-accelerating-pipelines", "title": "Example: Accelerating pipelines\u00b6", "text": "<p>This example shows how to accelerate your models on cpu using sklearnex.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/accelerating_sklearnex/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/accelerating_sklearnex/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/accelerating_sklearnex/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/advanced_plotting/", "title": "Advanced plotting", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... 
Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=1)\natom.impute()\natom.encode()\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=1) atom.impute() atom.encode() <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n</pre> In\u00a0[4]: Copied! <pre># Let's see how the default aesthetics looks like\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> # Let's see how the default aesthetics looks like atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[5]: Copied! <pre># Change the color palette using color names or their hex codes\natom.palette = [\"red\", \"#00f\"]\n</pre> # Change the color palette using color names or their hex codes atom.palette = [\"red\", \"#00f\"] In\u00a0[6]: Copied! 
<pre>atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[7]: Copied! <pre># Change the title and label fontsize\natom.title_fontsize = 30\natom.label_fontsize = 24\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> # Change the title and label fontsize atom.title_fontsize = 30 atom.label_fontsize = 24 atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[8]: Copied! <pre># Use the update_layout method to change layout properties\natom.update_layout(template=\"simple_white\", barmode=\"group\", hovermode=\"x\")\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> # Use the update_layout method to change layout properties atom.update_layout(template=\"simple_white\", barmode=\"group\", hovermode=\"x\") atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[9]: Copied! <pre># Use the update_traces method to change the trace (note the y-axis)\natom.update_traces(histnorm=\"percent\", selector=dict(type=\"histogram\"))\natom.plot_distribution(columns=[1, 2], distributions=None, title=\"Distribution of temperatures\")\n</pre> # Use the update_traces method to change the trace (note the y-axis) atom.update_traces(histnorm=\"percent\", selector=dict(type=\"histogram\")) atom.plot_distribution(columns=[1, 2], distributions=None, title=\"Distribution of temperatures\") In\u00a0[10]: Copied! <pre># Let's go back to the default aesthetics\natom.reset_aesthetics()\natom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\")\n</pre> # Let's go back to the default aesthetics atom.reset_aesthetics() atom.plot_distribution(columns=[1, 2], title=\"Distribution of temperatures\") In\u00a0[11]: Copied! 
<pre># And update the title with some custom fonts\natom.plot_distribution(\n    columns=[1, 2],\n    title=dict(\n        text=\"Distribution of temperatures\",\n        font_color=\"teal\",\n        x=0,\n        xanchor=\"left\",\n    )\n)\n</pre> # And update the title with some custom fonts atom.plot_distribution(     columns=[1, 2],     title=dict(         text=\"Distribution of temperatures\",         font_color=\"teal\",         x=0,         xanchor=\"left\",     ) ) In\u00a0[12]: Copied! <pre># We can update the legend in a similar fashion\natom.plot_distribution(\n    columns=[1, 2],\n    title=dict(\n        text=\"Distribution of temperatures\",\n        font_color=\"teal\",\n        x=0,\n        xanchor=\"left\",\n    ),\n    legend=dict(title=\"Legend's title\"),\n)\n</pre> # We can update the legend in a similar fashion atom.plot_distribution(     columns=[1, 2],     title=dict(         text=\"Distribution of temperatures\",         font_color=\"teal\",         x=0,         xanchor=\"left\",     ),     legend=dict(title=\"Legend's title\"), ) In\u00a0[13]: Copied! 
<pre>atom.run(\"LR\")\n\n# You can plot the ROC curve for a selection of rows,\n# for example, for rows in a specific location\natom.plot_roc(\n    rows={\n        \"Portland\": atom.test.loc[atom.og.X.Location == \"Portland\"],\n        \"Sydney\": atom.test.loc[atom.og.X.Location == \"Sydney\"],\n    }\n)\n</pre> atom.run(\"LR\")  # You can plot the ROC curve for a selection of rows, # for example, for rows in a specific location atom.plot_roc(     rows={         \"Portland\": atom.test.loc[atom.og.X.Location == \"Portland\"],         \"Sydney\": atom.test.loc[atom.og.X.Location == \"Sydney\"],     } ) <pre>\nTraining ========================= &gt;&gt;\nModels: LR\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6247\nTest evaluation --&gt; f1: 0.6093\nTime elapsed: 0.636s\n-------------------------------------------------\nTime: 0.636s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.044s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.6093\n</pre> In\u00a0[14]: Copied! <pre># Note how the same column over different plots is grouped\nwith atom.canvas(2, 2):\n    atom.plot_distribution(columns=1)\n    atom.plot_distribution(columns=2)\n    atom.plot_qq(columns=[1, 2], distributions=[\"norm\", \"invgauss\"])\n    atom.plot_qq(columns=[1, 2])\n</pre> # Note how the same column over different plots is grouped with atom.canvas(2, 2):     atom.plot_distribution(columns=1)     atom.plot_distribution(columns=2)     atom.plot_qq(columns=[1, 2], distributions=[\"norm\", \"invgauss\"])     atom.plot_qq(columns=[1, 2])"}, {"location": "examples/advanced_plotting/#example-advanced-plotting", "title": "Example: Advanced plotting\u00b6", "text": "<p>This example shows how to make the best use of all of atom's plotting options.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/advanced_plotting/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-colors-and-font-size", "title": "Customize colors and font size\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-plots-layout", "title": "Customize the plot's layout\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-plots-traces", "title": "Customize the plot's traces\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customize-the-title-and-legend", "title": "Customize the title and legend\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#customizing-the-rows-to-plot", "title": "Customizing the rows to plot\u00b6", "text": ""}, {"location": "examples/advanced_plotting/#using-a-canvas", "title": "Using a canvas\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/", "title": "Automated feature scaling", "text": "In\u00a0[1]: Copied! <pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX, y = load_breast_cancer(return_X_y=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied! 
<pre>atom = ATOMClassifier(X, y, verbose=2, random_state=1)\n</pre> atom = ATOMClassifier(X, y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n</pre> In\u00a0[4]: Copied! <pre># Check which models require feature scaling\natom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]]\n</pre> # Check which models require feature scaling atom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]] Out[4]: acronym model needs_scaling 0 AdaB AdaBoost False 1 Bag Bagging False 2 BNB BernoulliNB False 3 CatB CatBoost True 4 CatNB CategoricalNB False 5 CNB ComplementNB False 6 Tree DecisionTree False 7 Dummy Dummy False 8 ETree ExtraTree False 9 ET ExtraTrees False 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM True 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM True 18 LR LogisticRegression True 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB False 21 PA PassiveAggressive True 22 Perc Perceptron True 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest False 26 Ridge Ridge True 27 SGD StochasticGradientDescent True 28 SVM SupportVectorMachine True 29 XGB XGBoost True In\u00a0[5]: Copied! 
<pre># We fit two models: LR needs scaling and Bag doesn't\natom.run([\"LR\", \"Bag\"])\n</pre> # We fit two models: LR needs scaling and Bag doesn't atom.run([\"LR\", \"Bag\"]) <pre>\nTraining ========================= &gt;&gt;\nModels: LR, Bag\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9913\nTest evaluation --&gt; f1: 0.9861\nTime elapsed: 0.051s\n-------------------------------------------------\nTime: 0.051s\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9982\nTest evaluation --&gt; f1: 0.9444\nTime elapsed: 0.111s\n-------------------------------------------------\nTime: 0.111s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.216s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9861 !\nBagging            --&gt; f1: 0.9444\n</pre> In\u00a0[6]: Copied! <pre># Now, we create a new branch and scale the features before fitting the model\natom.branch = \"scaling\"\n</pre> # Now, we create a new branch and scale the features before fitting the model atom.branch = \"scaling\" <pre>Successfully created new branch: scaling.\n</pre> In\u00a0[7]: Copied! <pre>atom.scale()\n</pre> atom.scale() <pre>Fitting Scaler...\nScaling features...\n</pre> In\u00a0[8]: Copied! <pre>atom.run(\"LR_2\")\n</pre> atom.run(\"LR_2\") <pre>\nTraining ========================= &gt;&gt;\nModels: LR_2\nMetric: f1\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9913\nTest evaluation --&gt; f1: 0.9861\nTime elapsed: 0.035s\n-------------------------------------------------\nTime: 0.035s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.057s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.9861\n</pre> In\u00a0[9]: Copied! 
<pre># Let's compare the differences between the models\nprint(atom.lr.scaler)\nprint(atom.bag.scaler)\nprint(atom.lr_2.scaler)\n</pre> # Let's compare the differences between the models print(atom.lr.scaler) print(atom.bag.scaler) print(atom.lr_2.scaler) <pre>Scaler()\nNone\nNone\n</pre> In\u00a0[10]: Copied! <pre># And the data they use is different\nprint(atom.lr.X.iloc[:5, :3])\nprint(\"-----------------------------\")\nprint(atom.bag.X.iloc[:5, :3])\nprint(\"-----------------------------\")\nprint(atom.lr_2.X_train.equals(atom.lr.X_train))\n</pre> # And the data they use is different print(atom.lr.X.iloc[:5, :3]) print(\"-----------------------------\") print(atom.bag.X.iloc[:5, :3]) print(\"-----------------------------\") print(atom.lr_2.X_train.equals(atom.lr.X_train)) <pre>         x0        x1        x2\n0 -0.181875  0.356669 -0.147122\n1  1.162216  0.300578  1.159704\n2  1.056470  1.212060  0.933833\n3  0.277287  2.457753  0.188054\n4 -1.442482 -0.825921 -1.343434\n-----------------------------\n      x0     x1      x2\n0  13.48  20.82   88.40\n1  18.31  20.58  120.80\n2  17.93  24.48  115.20\n3  15.13  29.81   96.71\n4   8.95  15.76   58.74\n-----------------------------\nTrue\n</pre> In\u00a0[11]: Copied! <pre># Note that the scaler is included in the model's pipeline\nprint(atom.lr.pipeline)\nprint(\"-----------------------------\")\nprint(atom.bag.pipeline)\nprint(\"-----------------------------\")\nprint(atom.lr_2.pipeline)\n</pre> # Note that the scaler is included in the model's pipeline print(atom.lr.pipeline) print(\"-----------------------------\") print(atom.bag.pipeline) print(\"-----------------------------\") print(atom.lr_2.pipeline) <pre>Pipeline(memory=Memory(location=None), steps=[('AutomatedScaler', Scaler())])\n-----------------------------\nPipeline(memory=Memory(location=None), steps=[])\n-----------------------------\nPipeline(memory=Memory(location=None), steps=[('Scaler', Scaler(verbose=2))])\n</pre> In\u00a0[12]: Copied! 
<pre>atom.plot_pipeline()\n</pre> atom.plot_pipeline()"}, {"location": "examples/automated_feature_scaling/#example-automated-feature-scaling", "title": "Example: Automated feature scaling\u00b6", "text": "<p>This example shows how ATOM handles models that require automated feature scaling.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/automated_feature_scaling/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/automated_feature_scaling/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/binary_classification/", "title": "Binary classification", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 
76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre># Call atom using only 5% of the complete dataset (for explanatory purposes)\natom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=0.05, n_jobs=8, verbose=2)\n</pre> # Call atom using only 5% of the complete dataset (for explanatory purposes) atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=0.05, n_jobs=8, verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nParallel processing with 8 cores.\nParallelization backend: loky\n\nDataset stats ==================== &gt;&gt;\nShape: (7109, 22)\nTrain set size: 5688\nTest set size: 1421\n-------------------------------------\nMemory: 1.25 MB\nScaled: False\nMissing values: 15868 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 1 (0.0%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># Impute missing values\natom.impute(strat_num=\"median\", strat_cat=\"drop\", max_nan_rows=0.8)\n</pre> # Impute missing values atom.impute(strat_num=\"median\", strat_cat=\"drop\", max_nan_rows=0.8) <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Dropping 7 samples for containing more than 16 missing values.\n --&gt; Imputing 23 missing values with median (11.9) in feature MinTemp.\n --&gt; Imputing 10 missing values with median (22.6) in feature MaxTemp.\n --&gt; Imputing 72 missing values with median (0.0) in feature Rainfall.\n --&gt; Imputing 3059 missing values with median (4.6) in feature Evaporation.\n --&gt; Imputing 3382 missing values with median (8.5) in feature Sunshine.\n --&gt; Dropping 467 samples due to missing values in feature WindGustDir.\n --&gt; Imputing 466 missing values with median (39.0) in feature WindGustSpeed.\n --&gt; Dropping 479 samples due to missing values in feature WindDir9am.\n --&gt; Dropping 165 samples due to missing values in feature WindDir3pm.\n --&gt; Imputing 53 missing values with median (13.0) in feature WindSpeed9am.\n --&gt; Imputing 115 missing values with median (17.0) in feature WindSpeed3pm.\n --&gt; Imputing 72 missing values with median (70.0) in feature Humidity9am.\n --&gt; Imputing 164 missing values with median (52.0) in feature Humidity3pm.\n --&gt; Imputing 699 missing values with median (1017.7) in feature Pressure9am.\n --&gt; Imputing 699 missing values with median (1015.4) in feature Pressure3pm.\n --&gt; Imputing 2698 missing values with median (5.0) in feature Cloud9am.\n --&gt; Imputing 2903 missing values with median (5.0) in feature Cloud3pm.\n --&gt; Imputing 32 missing values with median (16.7) in feature Temp9am.\n --&gt; Imputing 116 missing values with median (21.1) in feature Temp3pm.\n --&gt; Dropping 72 samples due to missing values in feature RainToday.\n</pre> In\u00a0[5]: Copied! 
<pre># Encode the categorical features\natom.encode(strategy=\"Target\", max_onehot=10, infrequent_to_value=0.04)\n</pre> # Encode the categorical features atom.encode(strategy=\"Target\", max_onehot=10, infrequent_to_value=0.04) <pre>Fitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 47 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[6]: Copied! <pre># Train an Extra-Trees and a Random Forest model\natom.run(models=[\"ET\", \"RF\"], metric=\"f1\", n_bootstrap=5)\n</pre> # Train an Extra-Trees and a Random Forest model atom.run(models=[\"ET\", \"RF\"], metric=\"f1\", n_bootstrap=5) <pre>\nTraining ========================= &gt;&gt;\nModels: ET, RF\nMetric: f1\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.5956\nTime elapsed: 1.414s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5709 \u00b1 0.0198\nTime elapsed: 1.020s\n-------------------------------------------------\nTime: 2.434s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.6124\nTime elapsed: 0.337s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5802 \u00b1 0.0111\nTime elapsed: 1.281s\n-------------------------------------------------\nTime: 1.618s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 4.225s\n-------------------------------------\nExtraTrees   --&gt; f1: 0.5709 \u00b1 0.0198 ~\nRandomForest --&gt; f1: 0.5802 \u00b1 0.0111 ~ !\n</pre> In\u00a0[7]: Copied! 
<pre># Let's have a look at the final results\natom.results\n</pre> # Let's have a look at the final results atom.results Out[7]: f1_train f1_test time_fit f1_bootstrap time_bootstrap time ET 0.8503 0.5688 1.414043 0.570892 1.019728 2.433771 RF 0.8552 0.5612 0.336765 0.580178 1.281000 1.617765 In\u00a0[8]: Copied! <pre># Visualize the bootstrap results\natom.plot_results(title=\"RF vs ET performance\")\n</pre> # Visualize the bootstrap results atom.plot_results(title=\"RF vs ET performance\") In\u00a0[9]: Copied! <pre># Print the results of some common metrics\natom.evaluate()\n</pre> # Print the results of some common metrics atom.evaluate() Out[9]: accuracy ap ba f1 jaccard mcc precision recall auc ET 0.8478 0.6904 0.7059 0.5688 0.3974 0.5108 0.7750 0.4493 0.8561 RF 0.8405 0.6775 0.7038 0.5612 0.3901 0.4891 0.7283 0.4565 0.8502 In\u00a0[10]: Copied! <pre># The winner attribute calls the best model (atom.winner == atom.rf)\nprint(f\"The winner is the {atom.winner.name} model!!\")\n</pre> # The winner attribute calls the best model (atom.winner == atom.rf) print(f\"The winner is the {atom.winner.name} model!!\") <pre>The winner is the RF model!!\n</pre> In\u00a0[11]: Copied! <pre># Visualize the distribution of predicted probabilities\natom.winner.plot_probabilities()\n</pre> # Visualize the distribution of predicted probabilities atom.winner.plot_probabilities() In\u00a0[12]: Copied! <pre># Compare how different metrics perform for different thresholds\natom.winner.plot_threshold(metric=[\"f1\", \"accuracy\", \"ap\"], steps=50)\n</pre> # Compare how different metrics perform for different thresholds atom.winner.plot_threshold(metric=[\"f1\", \"accuracy\", \"ap\"], steps=50)"}, {"location": "examples/binary_classification/#example-binary-classification", "title": "Example: Binary classification\u00b6", "text": "<p>This example shows how to use ATOM to solve a binary classification problem. 
Additionally, we'll perform a variety of data cleaning steps to prepare the data for modeling.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/binary_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/binary_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/binary_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/calibration/", "title": "Calibration", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load the data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! 
<pre>atom = ATOMClassifier(X, \"RainTomorrow\", n_rows=1e4, verbose=1, warnings=False)\n\n# Apply data cleaning steps\natom.clean()\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")\natom.encode(strategy=\"target\", max_onehot=5, infrequent_to_value=0.05)\n\n# Train a Gaussian Naive Bayes model\natom.run(\"gnb\")\n</pre> atom = ATOMClassifier(X, \"RainTomorrow\", n_rows=1e4, verbose=1, warnings=False)  # Apply data cleaning steps atom.clean() atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\") atom.encode(strategy=\"target\", max_onehot=5, infrequent_to_value=0.05)  # Train a Gaussian Naive Bayes model atom.run(\"gnb\") <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (10000, 22)\nTrain set size: 8000\nTest set size: 2000\n-------------------------------------\nMemory: 1.76 MB\nScaled: False\nMissing values: 22184 (10.1%)\nCategorical features: 5 (23.8%)\n\nFitting Cleaner...\nCleaning the data...\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n\nTraining ========================= &gt;&gt;\nModels: GNB\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5807\nTest evaluation --&gt; f1: 0.5971\nTime elapsed: 0.094s\n-------------------------------------------------\nTime: 0.094s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.160s\n-------------------------------------\nGaussianNB --&gt; f1: 0.5971\n</pre> In\u00a0[4]: Copied! <pre># Check the model's calibration\natom.plot_calibration()\n</pre> # Check the model's calibration atom.plot_calibration() In\u00a0[5]: Copied! 
<pre># Let's try to improve it using the calibrate method\natom.winner.calibrate(method=\"isotonic\", cv=5)\n</pre> # Let's try to improve it using the calibrate method atom.winner.calibrate(method=\"isotonic\", cv=5) <pre>Results for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5034\nTest evaluation --&gt; f1: 0.5061\nTime elapsed: 0.282s\n</pre> In\u00a0[6]: Copied! <pre># And check again...\natom.plot_calibration()\n</pre> # And check again... atom.plot_calibration()"}, {"location": "examples/calibration/#example-calibration", "title": "Example: Calibration\u00b6", "text": "<p>This example shows how to calibrate a classifier through atom.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/calibration/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/calibration/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/calibration/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/deep_learning/", "title": "Deep learning", "text": "In\u00a0[1]: Copied! 
<pre># Disable annoying tf warnings\nimport os\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\nfrom tensorflow import get_logger\nget_logger().setLevel('ERROR')\n\nimport absl.logging\nabsl.logging.set_verbosity(absl.logging.ERROR)\n\nfrom atom import ATOMClassifier, ATOMModel\nfrom sklearn.preprocessing import FunctionTransformer\nfrom optuna.pruners import PatientPruner\nfrom optuna.distributions import CategoricalDistribution, IntDistribution\n\nfrom scikeras.wrappers import KerasClassifier\nfrom keras.datasets import mnist\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Flatten, Conv2D, Dropout\n</pre> # Disable annoying tf warnings import os os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"  from tensorflow import get_logger get_logger().setLevel('ERROR')  import absl.logging absl.logging.set_verbosity(absl.logging.ERROR)  from atom import ATOMClassifier, ATOMModel from sklearn.preprocessing import FunctionTransformer from optuna.pruners import PatientPruner from optuna.distributions import CategoricalDistribution, IntDistribution  from scikeras.wrappers import KerasClassifier from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense, Flatten, Conv2D, Dropout In\u00a0[2]: Copied! <pre># Download the MNIST dataset\n(X_train, y_train), (X_test, y_test) = mnist.load_data()\n\n# Flatten data to follow sklearn's API (2d input)\nX_train = X_train.reshape(len(X_train), -1)\nX_test = X_test.reshape(len(X_test), -1)\n\ndata = (X_train, y_train), (X_test, y_test)\n</pre> # Download the MNIST dataset (X_train, y_train), (X_test, y_test) = mnist.load_data()  # Flatten data to follow sklearn's API (2d input) X_train = X_train.reshape(len(X_train), -1) X_test = X_test.reshape(len(X_test), -1)  data = (X_train, y_train), (X_test, y_test) In\u00a0[3]: Copied! 
<pre># Create the convolutional neural network\nclass ConvNN(KerasClassifier):\n    \"\"\"Convolutional neural network model.\"\"\"\n\n    @property\n    def feature_encoder(self):\n        \"\"\"Convert the 2d input to the image's format (len(X), 28, 28, 1).\"\"\"\n        return FunctionTransformer(\n            func=lambda X: X.reshape(X.shape[0], 28, 28, 1),\n        )\n\n    @staticmethod\n    def _keras_build_fn(**kwargs):\n        \"\"\"Create the model's architecture.\"\"\"\n        model = Sequential()\n        model.add(\n            Conv2D(\n                filters=8,\n                kernel_size=3,\n                activation=\"relu\",\n                input_shape=(28, 28, 1),\n            )\n        )\n        model.add(Conv2D(filters=4, kernel_size=5, activation=\"relu\"))\n        model.add(Flatten())\n        model.add(Dense(units=10, activation=\"softmax\"))\n        model.compile(\n            optimizer=\"adam\",\n            loss=\"sparse_categorical_crossentropy\",\n        )\n\n        return model\n</pre> # Create the convolutional neural network class ConvNN(KerasClassifier):     \"\"\"Convolutional neural network model.\"\"\"      @property     def feature_encoder(self):         \"\"\"Convert the 2d input to the image's format (len(X), 28, 28, 1).\"\"\"         return FunctionTransformer(             func=lambda X: X.reshape(X.shape[0], 28, 28, 1),         )      @staticmethod     def _keras_build_fn(**kwargs):         \"\"\"Create the model's architecture.\"\"\"         model = Sequential()         model.add(             Conv2D(                 filters=8,                 kernel_size=3,                 activation=\"relu\",                 input_shape=(28, 28, 1),             )         )         model.add(Conv2D(filters=4, kernel_size=5, activation=\"relu\"))         model.add(Flatten())         model.add(Dense(units=10, activation=\"softmax\"))         model.compile(             optimizer=\"adam\",             
loss=\"sparse_categorical_crossentropy\",         )          return model In\u00a0[4]: Copied! <pre># Convert the model to an ATOM model\nmodel = ATOMModel(\n    estimator=ConvNN(verbose=0),\n    acronym=\"CNN\",\n    needs_scaling=True,  # Applies automated feature scaling before fitting\n    has_validation=\"epochs\",  # Applies in-training validation on parameter epochs\n)\n</pre> # Convert the model to an ATOM model model = ATOMModel(     estimator=ConvNN(verbose=0),     acronym=\"CNN\",     needs_scaling=True,  # Applies automated feature scaling before fitting     has_validation=\"epochs\",  # Applies in-training validation on parameter epochs ) In\u00a0[5]: Copied! <pre>atom = ATOMClassifier(*data, n_rows=0.1, verbose=2, random_state=1)\n</pre> atom = ATOMClassifier(*data, n_rows=0.1, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multiclass classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (7000, 785)\nTrain set size: 6000\nTest set size: 1000\n-------------------------------------\nMemory: 5.54 MB\nScaled: False\nOutlier values: 41839 (0.9%)\n\n</pre> In\u00a0[6]: Copied! 
<pre># Like any other model, we can define custom distributions for hyperparameter tuning\natom.run(\n    models=model,\n    metric=\"f1_weighted\",\n    n_trials=12,\n    ht_params={\n        \"distributions\": {\n            \"epochs\": IntDistribution(2, 10),\n            \"batch_size\": CategoricalDistribution([128, 256, 512]),\n        },\n    }\n)\n</pre> # Like any other model, we can define custom distributions for hyperparameter tuning atom.run(     models=model,     metric=\"f1_weighted\",     n_trials=12,     ht_params={         \"distributions\": {             \"epochs\": IntDistribution(2, 10),             \"batch_size\": CategoricalDistribution([128, 256, 512]),         },     } ) <pre>\nTraining ========================= &gt;&gt;\nModels: CNN\nMetric: f1_weighted\n\n\nRunning hyperparameter tuning for ConvNN...\n| trial |  epochs | batch_size | f1_weighted | best_f1_weighted | time_trial | time_ht |    state |\n| ----- | ------- | ---------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |       5 |        128 |      0.9147 |           0.9147 |     9.127s |  9.127s | COMPLETE |\n| 1     |       3 |        512 |      0.8539 |           0.9147 |     4.995s | 14.122s | COMPLETE |\n| 2     |       5 |        512 |      0.8931 |           0.9147 |     7.712s | 21.834s | COMPLETE |\n| 3     |       3 |        128 |       0.901 |           0.9147 |     5.706s | 27.540s | COMPLETE |\n| 4     |       5 |        128 |      0.9147 |           0.9147 |     0.607s | 28.147s | COMPLETE |\n| 5     |       9 |        128 |      0.9251 |           0.9251 |    15.297s | 43.443s | COMPLETE |\n| 6     |       9 |        128 |      0.9251 |           0.9251 |     1.230s | 44.673s | COMPLETE |\n| 7     |       3 |        128 |       0.901 |           0.9251 |     0.636s | 45.309s | COMPLETE |\n| 8     |      10 |        256 |      0.8131 |           0.9251 |     2.573s | 47.882s |   PRUNED |\n| 9     |       8 |        128 |      0.9191 |    
       0.9251 |    14.014s | 01m:02s |   PRUNED |\n| 10    |       7 |        256 |       0.836 |           0.9251 |     2.498s | 01m:04s |   PRUNED |\n| 11    |      10 |        128 |      0.9431 |           0.9431 |    16.725s | 01m:21s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 11\nBest parameters:\n --&gt; epochs: 10\n --&gt; batch_size: 128\nBest evaluation --&gt; f1_weighted: 0.9431\nTime elapsed: 01m:21s\nFit ---------------------------------------------\nTrain evaluation --&gt; f1_weighted: 0.9835\nTest evaluation --&gt; f1_weighted: 0.952\nTime elapsed: 28.600s\n-------------------------------------------------\nTime: 01m:50s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 03m:39s\n-------------------------------------\nConvNN --&gt; f1_weighted: 0.952\n</pre> In\u00a0[7]: Copied! <pre>atom.cnn.trials\n</pre> atom.cnn.trials Out[7]: epochs batch_size estimator f1_weighted best_f1_weighted time_trial time_ht state trial 0 5 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.914748 0.943121 9.126504 9.126504 COMPLETE 1 3 512 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.853919 0.943121 4.995052 14.121556 COMPLETE 2 5 512 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.893128 0.943121 7.712461 21.834017 COMPLETE 3 3 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.900996 0.943121 5.705581 27.539598 COMPLETE 4 5 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.914748 0.943121 0.607057 28.146655 COMPLETE 5 9 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.925088 0.943121 15.296670 43.443325 COMPLETE 6 9 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.925088 0.943121 1.229779 44.673104 COMPLETE 7 3 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.900996 0.943121 0.635578 45.308682 COMPLETE 8 10 256 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 
0.813073 0.943121 2.573343 47.882025 PRUNED 9 8 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.919095 0.943121 14.014060 61.896085 PRUNED 10 7 256 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.835966 0.943121 2.498169 64.394254 PRUNED 11 10 128 ConvNN(\\n\\tmodel=None\\n\\tbuild_fn=None\\n\\twarm... 0.943121 0.943121 16.725048 81.119302 COMPLETE In\u00a0[8]: Copied! <pre>atom.plot_evals(dataset=\"test+train\")\n</pre> atom.plot_evals(dataset=\"test+train\") In\u00a0[9]: Copied! <pre># Use the prediction methods like any other model\natom.cnn.predict_proba(X_train)\n</pre> # Use the prediction methods like any other model atom.cnn.predict_proba(X_train) Out[9]: 0 1 2 3 4 5 6 7 8 9 0 6.981344e-08 1.163047e-08 1.302092e-07 7.298404e-01 4.980663e-11 2.701415e-01 6.764501e-11 1.982446e-06 5.807213e-07 1.532895e-05 1 9.999958e-01 2.160013e-12 2.527803e-06 1.498349e-07 2.094386e-09 4.418725e-07 6.460270e-07 2.255171e-07 2.042284e-08 7.188346e-08 2 1.154879e-10 2.405690e-10 1.185454e-07 3.165163e-07 9.995613e-01 1.887145e-11 6.159564e-12 4.155245e-04 1.546579e-09 2.274483e-05 3 5.565947e-07 9.992028e-01 6.758810e-04 3.334095e-06 2.312364e-05 9.298934e-08 1.309337e-07 7.859311e-05 1.515798e-05 3.681653e-07 4 4.683458e-09 4.092270e-08 3.246872e-07 1.020155e-06 2.804452e-03 9.423515e-08 3.789635e-12 8.406813e-03 7.883451e-05 9.887084e-01 ... ... ... ... ... ... ... ... ... ... ... 
59995 7.329114e-09 4.127999e-08 3.695257e-06 1.461548e-04 1.231008e-09 6.157245e-06 2.624072e-11 8.209722e-09 9.998319e-01 1.199038e-05 59996 6.239399e-08 2.397851e-09 1.575265e-03 9.643788e-01 8.514269e-08 1.101398e-04 1.774388e-10 1.135693e-07 3.362476e-02 3.106496e-04 59997 7.059591e-10 5.808693e-09 1.657147e-11 3.829917e-05 3.490374e-07 9.998387e-01 4.054391e-11 4.646493e-11 1.087904e-04 1.385001e-05 59998 1.183419e-05 2.104532e-09 1.940764e-06 1.050059e-07 8.195059e-06 5.124656e-06 9.999721e-01 4.185512e-09 7.723169e-07 1.096977e-09 59999 3.987676e-04 1.140556e-06 4.448286e-04 4.279935e-06 1.410985e-07 2.539659e-03 8.256741e-08 8.921248e-08 9.958331e-01 7.779775e-04 <p>60000 rows \u00d7 10 columns</p> In\u00a0[10]: Copied! <pre># Or make plots...\natom.cnn.plot_hyperparameters()\n</pre> # Or make plots... atom.cnn.plot_hyperparameters() In\u00a0[11]: Copied! <pre>atom.plot_parallel_coordinate()\n</pre> atom.plot_parallel_coordinate()"}, {"location": "examples/deep_learning/#example-deep-learning", "title": "Example: Deep learning\u00b6", "text": "<p>This example shows how to use ATOM to train and validate a Convolutional Neural Network implemented with Keras using scikeras.</p> <p>Import the MNIST dataset from keras.datasets. This is a well known image dataset whose goal is to classify handwritten digits.</p>"}, {"location": "examples/deep_learning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/deep_learning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/deep_learning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/ensembles/", "title": "Ensembles", "text": "In\u00a0[1]: Copied! <pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied! 
<pre># Load the data\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True, as_frame=True) In\u00a0[3]: Copied! <pre># Initialize atom and train several models\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\natom.run(models=[\"LR\", \"Tree\", \"LGB\"], metric=\"accuracy\")\n</pre> # Initialize atom and train several models atom = ATOMClassifier(X, y, verbose=2, random_state=1) atom.run(models=[\"LR\", \"Tree\", \"LGB\"], metric=\"accuracy\") <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 138.97 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n\nTraining ========================= &gt;&gt;\nModels: LR, Tree, LGB\nMetric: accuracy\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 0.989\nTest evaluation --&gt; accuracy: 0.9823\nTime elapsed: 0.048s\n-------------------------------------------------\nTime: 0.048s\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 1.0\nTest evaluation --&gt; accuracy: 0.9469\nTime elapsed: 0.042s\n-------------------------------------------------\nTime: 0.042s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 1.0\nTest evaluation --&gt; accuracy: 0.9469\nTime elapsed: 0.246s\n-------------------------------------------------\nTime: 0.246s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.419s\n-------------------------------------\nLogisticRegression --&gt; accuracy: 0.9823 !\nDecisionTree       --&gt; accuracy: 0.9469\nLightGBM           --&gt; accuracy: 0.9469\n</pre> In\u00a0[4]: Copied! 
<pre># Combine the models into a Voting model\natom.voting(voting=\"soft\")\n</pre> # Combine the models into a Voting model atom.voting(voting=\"soft\") <pre>Results for Voting:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 1.0\nTest evaluation --&gt; accuracy: 0.9469\nTime elapsed: 0.055s\n</pre> In\u00a0[5]: Copied! <pre># Note that we now have an extra model in the pipeline\natom.models\n</pre> # Note that we now have an extra model in the pipeline atom.models Out[5]: <pre>['LR', 'Tree', 'LGB', 'Vote']</pre> In\u00a0[6]: Copied! <pre># The plot_pipeline method helps us visualize the ensemble\natom.plot_pipeline()\n</pre> # The plot_pipeline method helps us visualize the ensemble atom.plot_pipeline() In\u00a0[7]: Copied! <pre># The Vote model averages the scores of the models it contains\natom.vote\n</pre> # The Vote model averages the scores of the models it contains atom.vote Out[7]: <pre>Voting()</pre> In\u00a0[8]: Copied! <pre># We can use it like any other model to make predictions or plots\natom.vote.predict_proba(range(10))\n</pre> # We can use it like any other model to make predictions or plots atom.vote.predict_proba(range(10)) Out[8]: 0 1 0 0.961516 0.038484 1 0.999968 0.000032 2 0.998743 0.001257 3 0.968071 0.031929 4 0.000014 0.999986 5 0.999991 0.000009 6 0.000019 0.999981 7 0.000015 0.999985 8 0.000026 0.999974 9 0.002627 0.997373 In\u00a0[9]: Copied! <pre>atom.vote.plot_threshold(metric=[\"auc\", \"recall\", \"accuracy\"])\n</pre> atom.vote.plot_threshold(metric=[\"auc\", \"recall\", \"accuracy\"]) In\u00a0[10]: Copied! <pre>atom.plot_results(legend=None)\n</pre> atom.plot_results(legend=None) In\u00a0[11]: Copied! <pre>atom.delete(\"vote\")\n</pre> atom.delete(\"vote\") <pre>Deleting 1 models...\n --&gt; Model Vote successfully deleted.\n</pre> In\u00a0[12]: Copied! 
<pre># Just like Voting, we can create a Stacking model\natom.stacking(final_estimator=\"LDA\")\n</pre> # Just like Voting, we can create a Stacking model atom.stacking(final_estimator=\"LDA\") <pre>Results for Stacking:\nFit ---------------------------------------------\nTrain evaluation --&gt; accuracy: 0.9934\nTest evaluation --&gt; accuracy: 0.9823\nTime elapsed: 0.728s\n</pre> In\u00a0[13]: Copied! <pre># The final estimator uses the predictions of the underlying models\natom.stack.head()\n</pre> # The final estimator uses the predictions of the underlying models atom.stack.head() Out[13]: mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension target 0 13.48 20.82 88.40 559.2 0.10160 0.12550 0.10630 0.05439 0.1720 0.06419 ... 26.02 107.30 740.4 0.1610 0.42250 0.5030 0.22580 0.2807 0.10710 0 1 18.31 20.58 120.80 1052.0 0.10680 0.12480 0.15690 0.09451 0.1860 0.05941 ... 26.20 142.20 1493.0 0.1492 0.25360 0.3759 0.15100 0.3074 0.07863 0 2 17.93 24.48 115.20 998.9 0.08855 0.07027 0.05699 0.04744 0.1538 0.05510 ... 34.69 135.10 1320.0 0.1315 0.18060 0.2080 0.11360 0.2504 0.07948 0 3 15.13 29.81 96.71 719.5 0.08320 0.04605 0.04686 0.02739 0.1852 0.05294 ... 36.91 110.10 931.4 0.1148 0.09866 0.1547 0.06575 0.3233 0.06165 0 4 8.95 15.76 58.74 245.2 0.09462 0.12430 0.09263 0.02308 0.1305 0.07163 ... 17.07 63.34 270.0 0.1179 0.18790 0.1544 0.03846 0.1652 0.07722 1 <p>5 rows \u00d7 31 columns</p> In\u00a0[14]: Copied! 
<pre># Again, the model can be used for predictions or plots\natom.stack.predict(X)\n</pre> # Again, the model can be used for predictions or plots atom.stack.predict(X) Out[14]: <pre>0      0\n1      0\n2      0\n3      0\n4      1\n      ..\n564    1\n565    0\n566    0\n567    0\n568    1\nName: target, Length: 569, dtype: int64</pre> In\u00a0[15]: Copied! <pre>atom.stack.plot_shap_beeswarm(show=10)\n</pre> atom.stack.plot_shap_beeswarm(show=10) <pre>PermutationExplainer explainer: 114it [00:48,  2.01it/s]                                                                                                                                                                                                                                                             \n</pre>"}, {"location": "examples/ensembles/#example-ensembles", "title": "Example: Ensembles\u00b6", "text": "<p>This example shows how to use atom's ensemble techniques to improve predictions on a dataset combining several models.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/ensembles/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/ensembles/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/ensembles/#voting", "title": "Voting\u00b6", "text": ""}, {"location": "examples/ensembles/#stacking", "title": "Stacking\u00b6", "text": ""}, {"location": "examples/feature_engineering/", "title": "Feature engineering", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! 
<pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre># Initialize atom and apply data cleaning\natom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0)\natom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)\natom.encode(max_onehot=10, infrequent_to_value=0.04)\n</pre> # Initialize atom and apply data cleaning atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0) atom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8) atom.encode(max_onehot=10, infrequent_to_value=0.04) In\u00a0[4]: Copied! 
<pre>atom.verbose = 2  # Increase verbosity to see the output\n\n# Let's see how a LightGBM model performs\natom.run('LGB', metric='auc')\n</pre> atom.verbose = 2  # Increase verbosity to see the output  # Let's see how a LightGBM model performs atom.run('LGB', metric='auc') <pre>\nTraining ========================= &gt;&gt;\nModels: LGB\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9817\nTest evaluation --&gt; auc: 0.8584\nTime elapsed: 0.831s\n-------------------------------------------------\nTime: 0.831s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.963s\n-------------------------------------\nLightGBM --&gt; auc: 0.8584\n</pre> In\u00a0[5]: Copied! <pre># Since we are going to compare different datasets,\n# we need to create separate branches\natom.branch = \"dfs\"\n</pre> # Since we are going to compare different datasets, # we need to create separate branches atom.branch = \"dfs\" <pre>Successfully created new branch: dfs.\n</pre> In\u00a0[6]: Copied! <pre># Create 50 new features using dfs\natom.feature_generation(\"dfs\", n_features=50, operators=[\"add\", \"sub\", \"log\"])\n</pre> # Create 50 new features using dfs atom.feature_generation(\"dfs\", n_features=50, operators=[\"add\", \"sub\", \"log\"]) <pre>Fitting FeatureGenerator...\nGenerating new features...\n --&gt; 50 new features were added.\n</pre> In\u00a0[7]: Copied! <pre># The warnings warn us that some operators created missing values!\n# We can see the columns with missing values using the nans attribute\natom.nans\n</pre> # The warnings warn us that some operators created missing values! 
# We can see the columns with missing values using the nans attribute atom.nans Out[7]: <pre>Location                       0\nMinTemp                        0\nMaxTemp                        0\nRainfall                       0\nEvaporation                    0\n                              ..\nTemp9am - WindDir3pm           0\nWindDir9am + WindGustSpeed     0\nWindDir9am + WindSpeed3pm      0\nWindGustDir + WindSpeed9am     0\nWindSpeed3pm - WindSpeed9am    0\nLength: 73, dtype: int64</pre> In\u00a0[8]: Copied! <pre># Turn off warnings in the future\natom.warnings = False\n\n# Impute the data again to get rid of the missing values\natom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8)\n</pre> # Turn off warnings in the future atom.warnings = False  # Impute the data again to get rid of the missing values atom.impute(strat_num=\"knn\", strat_cat=\"remove\", max_nan_rows=0.8) <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Imputing 12 missing values using the KNN imputer in feature NATURAL_LOGARITHM(Temp3pm).\n</pre> In\u00a0[9]: Copied! <pre># 50 new features may be too much...\n# Let's check for multicollinearity and use rfecv to reduce the number\natom.feature_selection(\n    strategy=\"rfecv\",\n    solver=\"LGB\",\n    n_features=30,\n    scoring=\"auc\",\n    max_correlation=0.98,\n)\n</pre> # 50 new features may be too much... 
# Let's check for multicollinearity and use rfecv to reduce the number atom.feature_selection(     strategy=\"rfecv\",     solver=\"LGB\",     n_features=30,     scoring=\"auc\",     max_correlation=0.98, ) <pre>Fitting FeatureSelector...\nPerforming feature selection ...\n --&gt; Feature MinTemp was removed due to collinearity with another feature.\n --&gt; Feature MinTemp + RainToday_No was removed due to collinearity with another feature.\n --&gt; Feature MaxTemp was removed due to collinearity with another feature.\n --&gt; Feature MaxTemp + WindDir3pm was removed due to collinearity with another feature.\n --&gt; Feature MaxTemp + WindGustDir was removed due to collinearity with another feature.\n --&gt; Feature Rainfall was removed due to collinearity with another feature.\n --&gt; Feature Rainfall + RainToday_rare was removed due to collinearity with another feature.\n --&gt; Feature Rainfall + WindDir3pm was removed due to collinearity with another feature.\n --&gt; Feature Sunshine was removed due to collinearity with another feature.\n --&gt; Feature Sunshine - WindDir3pm was removed due to collinearity with another feature.\n --&gt; Feature WindGustSpeed was removed due to collinearity with another feature.\n --&gt; Feature WindSpeed9am was removed due to collinearity with another feature.\n --&gt; Feature WindSpeed3pm was removed due to collinearity with another feature.\n --&gt; Feature Humidity9am was removed due to collinearity with another feature.\n --&gt; Feature Humidity3pm was removed due to collinearity with another feature.\n --&gt; Feature NATURAL_LOGARITHM(Pressure3pm) was removed due to collinearity with another feature.\n --&gt; Feature Pressure3pm - RainToday_Yes was removed due to collinearity with another feature.\n --&gt; Feature Cloud9am + RainToday_No was removed due to collinearity with another feature.\n --&gt; Feature Cloud3pm was removed due to collinearity with another feature.\n --&gt; Feature Cloud3pm + Location was removed 
due to collinearity with another feature.\n --&gt; Feature Temp9am - WindDir3pm was removed due to collinearity with another feature.\n --&gt; Feature Temp3pm was removed due to collinearity with another feature.\n --&gt; Feature Temp3pm - WindDir9am was removed due to collinearity with another feature.\n --&gt; Feature RainToday_rare was removed due to collinearity with another feature.\n --&gt; rfecv selected 38 features from the dataset.\n   --&gt; Dropping feature Location (rank 12).\n   --&gt; Dropping feature Cloud9am (rank 2).\n   --&gt; Dropping feature RainToday_No (rank 10).\n   --&gt; Dropping feature RainToday_Yes (rank 11).\n   --&gt; Dropping feature Location + RainToday_rare (rank 9).\n   --&gt; Dropping feature Location - Pressure9am (rank 4).\n   --&gt; Dropping feature Location - Temp9am (rank 7).\n   --&gt; Dropping feature Location - WindGustDir (rank 8).\n   --&gt; Dropping feature RainToday_No - WindSpeed3pm (rank 3).\n   --&gt; Dropping feature RainToday_rare + Temp3pm (rank 5).\n   --&gt; Dropping feature Rainfall + RainToday_Yes (rank 6).\n</pre> In\u00a0[10]: Copied! <pre># The collinear attribute shows what features were removed due to multicollinearity\natom.collinear_\n</pre> # The collinear attribute shows what features were removed due to multicollinearity atom.collinear_ Out[10]: drop corr_feature corr_value 0 MinTemp MinTemp + RainToday_No, MinTemp + RainToday_Yes 0.9978, 0.9979 1 MinTemp + RainToday_No MinTemp, MinTemp + RainToday_Yes 0.9978, 0.9914 2 MaxTemp MaxTemp + WindDir3pm, MaxTemp + WindDir9am, Ma... 1.0, 1.0, 1.0 3 MaxTemp + WindDir3pm MaxTemp, MaxTemp + WindDir9am, MaxTemp + WindG... 1.0, 1.0, 1.0 4 MaxTemp + WindGustDir MaxTemp, MaxTemp + WindDir3pm, MaxTemp + WindD... 1.0, 1.0, 1.0 5 Rainfall Rainfall + RainToday_Yes, Rainfall + RainToday... 0.999, 0.9999, 1.0 6 Rainfall + RainToday_rare Rainfall, Rainfall + RainToday_Yes, Rainfall +... 
0.9999, 0.9989, 0.9999 7 Rainfall + WindDir3pm Rainfall, Rainfall + RainToday_Yes, Rainfall +... 1.0, 0.999, 0.9999 8 Sunshine RainToday_rare + Sunshine, Sunshine - WindDir3pm 0.9994, 0.9998 9 Sunshine - WindDir3pm Sunshine, RainToday_rare + Sunshine 0.9998, 0.9993 10 WindGustSpeed WindDir9am + WindGustSpeed 1.0 11 WindSpeed9am WindGustDir + WindSpeed9am 1.0 12 WindSpeed3pm WindDir9am + WindSpeed3pm 1.0 13 Humidity9am Humidity9am + WindGustDir 1.0 14 Humidity3pm Humidity3pm - Sunshine 0.9937 15 NATURAL_LOGARITHM(Pressure3pm) Pressure3pm, Pressure3pm - RainToday_Yes 1.0, 0.9981 16 Pressure3pm - RainToday_Yes Pressure3pm, NATURAL_LOGARITHM(Pressure3pm) 0.9981, 0.9981 17 Cloud9am + RainToday_No Cloud9am 0.9828 18 Cloud3pm Cloud3pm + Location, Cloud3pm + RainToday_rare 1.0, 0.9991 19 Cloud3pm + Location Cloud3pm, Cloud3pm + RainToday_rare 1.0, 0.9991 20 Temp9am - WindDir3pm Temp9am 1.0 21 Temp3pm RainToday_rare + Temp3pm, Temp3pm - WindDir9am 0.9999, 1.0 22 Temp3pm - WindDir9am Temp3pm, RainToday_rare + Temp3pm 1.0, 0.9999 23 RainToday_rare Location + RainToday_rare 1.0 In\u00a0[11]: Copied! <pre># After applying rfecv, we can plot the score per number of features\natom.plot_rfecv()\n</pre> # After applying rfecv, we can plot the score per number of features atom.plot_rfecv() In\u00a0[12]: Copied! 
<pre># Let's see how the model performs now\n# Add a tag to the model's acronym to not overwrite previous LGB\natom.run(\"LGB_dfs\", errors=\"raise\")\n</pre> # Let's see how the model performs now # Add a tag to the model's acronym to not overwrite previous LGB atom.run(\"LGB_dfs\", errors=\"raise\") <pre>\nTraining ========================= &gt;&gt;\nModels: LGB_dfs\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9893\nTest evaluation --&gt; auc: 0.8572\nTime elapsed: 1.045s\n-------------------------------------------------\nTime: 1.045s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.186s\n-------------------------------------\nLightGBM --&gt; auc: 0.8572\n</pre> In\u00a0[13]: Copied! <pre># Create another branch for the genetic features\n# Split from main to avoid the dfs features\natom.branch = \"gfg_from_main\"\n</pre> # Create another branch for the genetic features # Split from main to avoid the dfs features atom.branch = \"gfg_from_main\" <pre>Successfully created new branch: gfg.\n</pre> In\u00a0[14]: Copied! 
<pre># Create new features using Genetic Programming\natom.feature_generation(strategy='gfg', n_features=20)\n</pre> # Create new features using Genetic Programming atom.feature_generation(strategy='gfg', n_features=20) <pre>Fitting FeatureGenerator...\n    |   Population Average    |             Best Individual              |\n---- ------------------------- ------------------------------------------ ----------\n Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left\n   0     3.08         0.137852        3         0.505879              N/A     18.62s\n   1     3.30         0.332951        6         0.506041              N/A     19.23s\n   2     3.92         0.429317        7         0.525775              N/A     18.31s\n   3     4.64         0.459817        9         0.532823              N/A     16.25s\n   4     6.59         0.475058       11         0.540078              N/A     15.51s\n   5     8.04         0.498345       13          0.54114              N/A     14.56s\n   6     9.80         0.509423       13         0.543911              N/A     13.87s\n   7    10.86         0.513225       15         0.551242              N/A     13.28s\n   8    11.54         0.513973       15         0.554127              N/A     11.99s\n   9    12.21         0.516725       19         0.554172              N/A     11.44s\n  10    13.09         0.520543       17         0.556923              N/A     10.19s\n  11    13.24         0.519283       17         0.556923              N/A      9.07s\n  12    12.74          0.51949       21         0.558114              N/A      7.95s\n  13    13.88         0.521709       21         0.558114              N/A      6.68s\n  14    15.99         0.523381       19         0.558673              N/A      6.12s\n  15    16.74         0.523708       19         0.558673              N/A      7.97s\n  16    16.84         0.524509       19         0.560449              N/A      6.02s\n  17    16.79         0.525061      
 19         0.560449              N/A      2.26s\n  18    16.77         0.523639       21         0.561281              N/A      1.11s\n  19    17.03         0.524261       23         0.561813              N/A      0.00s\nGenerating new features...\n --&gt; 20 new features were added.\n</pre> In\u00a0[16]: Copied! <pre># We can see the feature's fitness and description through the genetic_features attribute\natom.genetic_features_\n</pre> # We can see the feature's fitness and description through the genetic_features attribute atom.genetic_features_ Out[16]: name description fitness 0 x23 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 1 x24 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.541449 2 x25 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.541449 3 x26 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 4 x27 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.541449 5 x28 mul(add(Cloud3pm, add(Cloud3pm, mul(add(WindGu... 0.541322 6 x29 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.541229 7 x30 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.541229 8 x31 mul(add(Cloud3pm, mul(Humidity3pm, WindDir3pm)... 0.540696 9 x32 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540674 10 x33 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.540674 11 x34 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540674 12 x35 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540281 13 x36 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.540281 14 x37 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539923 15 x38 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539923 16 x39 mul(add(WindGustSpeed, add(Humidity3pm, Rainfa... 0.539923 17 x40 mul(add(WindGustSpeed, Humidity3pm), mul(add(C... 0.539923 18 x41 mul(mul(add(Cloud3pm, add(Cloud3pm, mul(Humidi... 0.539923 19 x42 mul(add(Cloud3pm, add(Cloud3pm, mul(Humidity3p... 0.539909 In\u00a0[17]: Copied! 
<pre># Fit the model again\natom.run(\"LGB_gfg\", metric=\"auc\")\n</pre> # Fit the model again atom.run(\"LGB_gfg\", metric=\"auc\") <pre>\nTraining ========================= &gt;&gt;\nModels: LGB_gfg\nMetric: auc\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9857\nTest evaluation --&gt; auc: 0.8558\nTime elapsed: 1.044s\n-------------------------------------------------\nTime: 1.044s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.227s\n-------------------------------------\nLightGBM --&gt; auc: 0.8558\n</pre> In\u00a0[18]: Copied! <pre># Visualize the whole pipeline\natom.plot_pipeline()\n</pre> # Visualize the whole pipeline atom.plot_pipeline() In\u00a0[19]: Copied! <pre># Use atom's plots to compare the three models\natom.plot_roc(rows=\"test+train\")\n</pre> # Use atom's plots to compare the three models atom.plot_roc(rows=\"test+train\") In\u00a0[23]: Copied! <pre># To compare other plots it might be useful to use a canvas\nwith atom.canvas(1, 2, figsize=(1800, 800)):\n    atom.lgb_dfs.plot_roc(rows=\"test+train\")\n    atom.lgb_dfs.plot_feature_importance(show=10, title=\"LGB + dfs\")\n</pre> # To compare other plots it might be useful to use a canvas with atom.canvas(1, 2, figsize=(1800, 800)):     atom.lgb_dfs.plot_roc(rows=\"test+train\")     atom.lgb_dfs.plot_feature_importance(show=10, title=\"LGB + dfs\") In\u00a0[21]: Copied! <pre># We can check the feature importance with other plots as well\natom.plot_permutation_importance(models=[\"LGB_dfs\", \"LGB_gfg\"], show=12)\n</pre> # We can check the feature importance with other plots as well atom.plot_permutation_importance(models=[\"LGB_dfs\", \"LGB_gfg\"], show=12) In\u00a0[24]: Copied! 
<pre>atom.LGB_gfg.plot_shap_decision(rows=(0, 10), show=15)\n</pre> atom.LGB_gfg.plot_shap_decision(rows=(0, 10), show=15)"}, {"location": "examples/feature_engineering/#example-feature-engineering", "title": "Example: Feature engineering\u00b6", "text": "<p>This example shows how to use automated feature generation to improve a model's performance.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/feature_engineering/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/feature_engineering/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/feature_engineering/#deep-feature-synthesis", "title": "Deep Feature Synthesis\u00b6", "text": ""}, {"location": "examples/feature_engineering/#genetic-feature-generation", "title": "Genetic Feature Generation\u00b6", "text": ""}, {"location": "examples/feature_engineering/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/getting_started/", "title": "Getting started", "text": "In\u00a0[1]: Copied! <pre>import pandas as pd\nfrom atom import ATOMClassifier\n\n# Load the Australian Weather dataset\nX = pd.read_csv(\"https://raw.githubusercontent.com/tvdboom/ATOM/master/examples/datasets/weatherAUS.csv\")\n</pre> import pandas as pd from atom import ATOMClassifier  # Load the Australian Weather dataset X = pd.read_csv(\"https://raw.githubusercontent.com/tvdboom/ATOM/master/examples/datasets/weatherAUS.csv\") In\u00a0[2]: Copied! 
<pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=1000, verbose=2)\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", n_rows=1000, verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (1000, 22)\nTrain set size: 800\nTest set size: 200\n-------------------------------------\nMemory: 176.13 kB\nScaled: False\nMissing values: 2260 (10.3%)\nCategorical features: 5 (23.8%)\n\n</pre> In\u00a0[3]: Copied! <pre>atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")  \natom.encode(strategy=\"Target\", max_onehot=8)\n</pre> atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\")   atom.encode(strategy=\"Target\", max_onehot=8) <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Imputing 8 missing values with median (11.6) in feature MinTemp.\n --&gt; Imputing 2 missing values with median (22.3) in feature MaxTemp.\n --&gt; Imputing 12 missing values with median (0.0) in feature Rainfall.\n --&gt; Imputing 425 missing values with median (4.8) in feature Evaporation.\n --&gt; Imputing 480 missing values with median (8.55) in feature Sunshine.\n --&gt; Imputing 59 missing values with most_frequent (N) in feature WindGustDir.\n --&gt; Imputing 59 missing values with median (37.0) in feature WindGustSpeed.\n --&gt; Imputing 90 missing values with most_frequent (N) in feature WindDir9am.\n --&gt; Imputing 28 missing values with most_frequent (SW) in feature WindDir3pm.\n --&gt; Imputing 10 missing values with median (13.0) in feature WindSpeed9am.\n --&gt; Imputing 19 missing values with median (17.0) in feature WindSpeed3pm.\n --&gt; Imputing 17 missing values with median (70.0) in feature Humidity9am.\n --&gt; Imputing 31 missing values with median (51.0) in feature Humidity3pm.\n --&gt; Imputing 89 missing values with median (1017.8) in feature Pressure9am.\n --&gt; Imputing 87 
missing values with median (1015.2) in feature Pressure3pm.\n --&gt; Imputing 383 missing values with median (5.0) in feature Cloud9am.\n --&gt; Imputing 412 missing values with median (5.0) in feature Cloud3pm.\n --&gt; Imputing 11 missing values with median (16.5) in feature Temp9am.\n --&gt; Imputing 26 missing values with median (20.7) in feature Temp3pm.\n --&gt; Imputing 12 missing values with most_frequent (No) in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 49 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[4]: Copied! <pre>atom.run(models=[\"LDA\", \"AdaB\"], metric=\"auc\", n_trials=10)\n</pre> atom.run(models=[\"LDA\", \"AdaB\"], metric=\"auc\", n_trials=10) <pre>\nTraining ========================= &gt;&gt;\nModels: LDA, AdaB\nMetric: auc\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |   eigen |       0.9 |  0.8807 |   0.8807 |     0.162s |  0.162s | COMPLETE |\n| 1     |     svd |       nan |  0.8445 |   0.8807 |     0.147s |  0.309s | COMPLETE |\n| 2     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.310s | COMPLETE |\n| 3     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.311s | COMPLETE |\n| 4     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.312s | COMPLETE |\n| 5     |   eigen |       0.9 |  0.8807 |   0.8807 |     0.000s |  0.312s | COMPLETE |\n| 6     |     svd |       nan |  0.8445 |   0.8807 |     0.000s |  0.312s | COMPLETE |\n| 7     |     svd |       nan |  0.8445 |   0.8807 |    
 0.001s |  0.313s | COMPLETE |\n| 8     |   eigen |       0.5 |  0.8417 |   0.8807 |     0.143s |  0.456s | COMPLETE |\n| 9     |     svd |       nan |  0.8445 |   0.8807 |     0.001s |  0.457s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 0\nBest parameters:\n --&gt; solver: eigen\n --&gt; shrinkage: 0.9\nBest evaluation --&gt; auc: 0.8807\nTime elapsed: 0.457s\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.8381\nTest evaluation --&gt; auc: 0.8037\nTime elapsed: 0.025s\n-------------------------------------------------\nTime: 0.482s\n\n\nRunning hyperparameter tuning for AdaBoost...\n| trial | n_estimators | learning_rate | algorithm |     auc | best_auc | time_trial | time_ht |    state |\n| ----- | ------------ | ------------- | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     |           90 |        0.4088 |   SAMME.R |  0.8002 |   0.8002 |     0.331s |  0.331s | COMPLETE |\n| 1     |          190 |        0.1019 |   SAMME.R |  0.8294 |   0.8294 |     0.540s |  0.871s | COMPLETE |\n| 2     |          260 |         0.243 |   SAMME.R |   0.754 |   0.8294 |     0.645s |  1.515s | COMPLETE |\n| 3     |          490 |         0.041 |   SAMME.R |  0.7953 |   0.8294 |     1.105s |  2.620s | COMPLETE |\n| 4     |          210 |        0.1604 |     SAMME |  0.7969 |   0.8294 |     0.527s |  3.148s | COMPLETE |\n| 5     |          310 |        0.1504 |     SAMME |  0.7988 |   0.8294 |     0.696s |  3.843s | COMPLETE |\n| 6     |          380 |         2.445 |     SAMME |  0.5978 |   0.8294 |     0.830s |  4.674s | COMPLETE |\n| 7     |          100 |        0.9151 |     SAMME |  0.8372 |   0.8372 |     0.328s |  5.002s | COMPLETE |\n| 8     |          350 |        8.9334 |     SAMME |  0.6751 |   0.8372 |     0.786s |  5.787s | COMPLETE |\n| 9     |          450 |        0.1974 |     SAMME |    0.82 |   0.8372 |     0.969s |  6.757s | COMPLETE |\nHyperparameter tuning 
---------------------------\nBest trial --&gt; 7\nBest parameters:\n --&gt; n_estimators: 100\n --&gt; learning_rate: 0.9151\n --&gt; algorithm: SAMME\nBest evaluation --&gt; auc: 0.8372\nTime elapsed: 6.757s\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9133\nTest evaluation --&gt; auc: 0.8353\nTime elapsed: 0.232s\n-------------------------------------------------\nTime: 6.989s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 9.134s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; auc: 0.8037\nAdaBoost                   --&gt; auc: 0.8353 !\n</pre> In\u00a0[5]: Copied! <pre>atom.evaluate()\n</pre> atom.evaluate() Out[5]: accuracy ap ba f1 jaccard mcc precision recall auc LDA 0.785 0.5888 0.7533 0.5825 0.4110 0.4542 0.5000 0.6977 0.8037 AdaB 0.820 0.5801 0.7165 0.5610 0.3898 0.4490 0.5897 0.5349 0.8353"}, {"location": "examples/getting_started/#example-getting-started", "title": "Example: Getting started\u00b6", "text": "<p>This example shows how to get started with the atom-ml library.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/holdout_set/", "title": "Holdout set", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... 
Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre># Initialize atom specifying a fraction of the dataset for holdout\natom = ATOMClassifier(X, n_rows=0.5, holdout_size=0.2, verbose=2)\n</pre> # Initialize atom specifying a fraction of the dataset for holdout atom = ATOMClassifier(X, n_rows=0.5, holdout_size=0.2, verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (56877, 22)\nTrain set size: 42658\nTest set size: 14219\nHoldout set size: 14219\n-------------------------------------\nMemory: 10.01 MB\nScaled: False\nMissing values: 126822 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 15 (0.0%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># The test and holdout fractions are split after subsampling the dataset\n# Also note that the holdout data set is not a part of atom's dataset\nprint(\"Length loaded data:\", len(X))\nprint(\"Length dataset + holdout:\", len(atom.dataset) + len(atom.holdout))\n</pre> # The test and holdout fractions are split after subsampling the dataset # Also note that the holdout data set is not a part of atom's dataset print(\"Length loaded data:\", len(X)) print(\"Length dataset + holdout:\", len(atom.dataset) + len(atom.holdout)) <pre>Length loaded data: 142193\nLength dataset + holdout: 71096\n</pre> In\u00a0[5]: Copied! <pre>atom.impute()\natom.encode()\n</pre> atom.impute() atom.encode() <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Dropping 258 samples due to missing values in feature MinTemp.\n --&gt; Dropping 127 samples due to missing values in feature MaxTemp.\n --&gt; Dropping 553 samples due to missing values in feature Rainfall.\n --&gt; Dropping 24308 samples due to missing values in feature Evaporation.\n --&gt; Dropping 27187 samples due to missing values in feature Sunshine.\n --&gt; Dropping 3739 samples due to missing values in feature WindGustDir.\n --&gt; Dropping 3712 samples due to missing values in feature WindGustSpeed.\n --&gt; Dropping 3995 samples due to missing values in feature WindDir9am.\n --&gt; Dropping 1508 samples due to missing values in feature WindDir3pm.\n --&gt; Dropping 539 samples due to missing values in feature WindSpeed9am.\n --&gt; Dropping 1077 samples due to missing values in feature WindSpeed3pm.\n --&gt; Dropping 706 samples due to missing values in feature Humidity9am.\n --&gt; Dropping 1447 samples due to missing values in feature Humidity3pm.\n --&gt; Dropping 5610 samples due to missing values in feature Pressure9am.\n --&gt; Dropping 5591 samples due to missing values in feature Pressure3pm.\n --&gt; Dropping 21520 samples due to missing values in feature Cloud9am.\n --&gt; Dropping 22921 samples due 
to missing values in feature Cloud3pm.\n --&gt; Dropping 365 samples due to missing values in feature Temp9am.\n --&gt; Dropping 1106 samples due to missing values in feature Temp3pm.\n --&gt; Dropping 553 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 26 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[6]: Copied! <pre># Unlike train and test, the holdout data set is not transformed until used for predictions\natom.holdout\n</pre> # Unlike train and test, the holdout data set is not transformed until used for predictions atom.holdout Out[6]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 22540 NorahHead 15.8 23.7 0.4 NaN NaN SSW 50.0 NW NaN ... 79.0 80.0 1012.4 1009.6 NaN NaN 18.4 18.9 No 0 22541 Brisbane 13.0 24.1 0.0 3.2 3.6 W 24.0 SW WSW ... 53.0 27.0 1019.9 1015.9 7.0 8.0 17.3 22.1 No 0 22542 MountGambier 14.7 36.2 0.0 7.2 12.5 S 33.0 N SSW ... 52.0 27.0 1018.8 1017.4 7.0 2.0 25.2 35.4 No 0 22543 Launceston 12.3 21.4 0.0 NaN NaN NNW 52.0 NNW NNW ... 62.0 60.0 NaN NaN 5.0 8.0 16.2 20.4 No 0 22544 MountGinini 3.2 10.0 0.0 NaN NaN WSW 52.0 WSW WSW ... 97.0 95.0 NaN NaN NaN NaN 6.5 8.4 No 0 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 36754 MountGinini 1.6 4.4 0.0 NaN NaN E 52.0 E E ... 100.0 100.0 NaN NaN NaN NaN 2.7 2.6 No 1 36755 WaggaWagga 9.9 21.8 0.0 4.6 5.7 WSW 35.0 S SW ... 57.0 36.0 1015.5 1013.7 7.0 7.0 17.0 21.3 No 0 36756 Walpole 8.8 16.3 0.8 NaN NaN NNW 37.0 NNE N ... 
84.0 79.0 1018.4 1013.5 NaN NaN 11.0 14.6 No 1 36757 Dartmoor 8.7 15.5 2.0 1.4 5.4 S 30.0 WSW SSW ... 100.0 94.0 1018.6 1020.0 NaN NaN 12.9 12.8 Yes 0 36758 SydneyAirport 16.8 22.6 8.4 5.0 3.8 S 57.0 WNW S ... 79.0 75.0 1013.2 1013.7 8.0 6.0 17.1 18.8 Yes 0 <p>14219 rows \u00d7 22 columns</p> In\u00a0[7]: Copied! <pre>atom.run(models=[\"GNB\", \"LR\", \"RF\"])\n</pre> atom.run(models=[\"GNB\", \"LR\", \"RF\"]) <pre>\nTraining ========================= &gt;&gt;\nModels: GNB, LR, RF\nMetric: f1\n\n\nResults for GaussianNB:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.604\nTest evaluation --&gt; f1: 0.6063\nTime elapsed: 0.209s\n-------------------------------------------------\nTime: 0.209s\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6188\nTest evaluation --&gt; f1: 0.6162\nTime elapsed: 0.323s\n-------------------------------------------------\nTime: 0.323s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 1.0\nTest evaluation --&gt; f1: 0.6084\nTime elapsed: 4.533s\n-------------------------------------------------\nTime: 4.533s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.734s\n-------------------------------------\nGaussianNB         --&gt; f1: 0.6063\nLogisticRegression --&gt; f1: 0.6162 !\nRandomForest       --&gt; f1: 0.6084 ~\n</pre> In\u00a0[8]: Copied! <pre>atom.plot_prc()\n</pre> atom.plot_prc() In\u00a0[9]: Copied! 
<pre># Based on the results on the test set, we select the best model for further tuning\natom.run(\"lr_tuned\", n_trials=10)\n</pre> # Based on the results on the test set, we select the best model for further tuning atom.run(\"lr_tuned\", n_trials=10) <pre>\nTraining ========================= &gt;&gt;\nModels: LR_tuned\nMetric: f1\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio |      f1 | best_f1 | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |    None |  0.1893 |     sag |      540 |      0.4 |  0.6096 |  0.6096 |     0.797s |  0.797s | COMPLETE |\n| 1     |      l2 |  0.6275 | newto.. |      150 |      0.7 |  0.6101 |  0.6101 |     0.637s |  1.433s | COMPLETE |\n| 2     |      l1 |  0.7457 | libli.. |      740 |      0.7 |  0.6114 |  0.6114 |     0.815s |  2.248s | COMPLETE |\n| 3     |      l2 |  0.0759 | newto.. |      290 |      0.4 |  0.6204 |  0.6204 |     0.634s |  2.882s | COMPLETE |\n| 4     |      l2 |  0.2122 | newto.. 
|      730 |      0.9 |  0.6273 |  0.6273 |     0.635s |  3.516s | COMPLETE |\n| 5     |      l2 |  0.0017 |   lbfgs |      260 |      1.0 |   0.589 |  0.6273 |     0.581s |  4.097s | COMPLETE |\n| 6     |      l2 |  0.0137 |     sag |      130 |      0.4 |  0.6092 |  0.6273 |     0.615s |  4.711s | COMPLETE |\n| 7     |    None |  0.0014 |     sag |      640 |      0.1 |  0.5909 |  0.6273 |     0.725s |  5.436s | COMPLETE |\n| 8     |      l2 |  0.0224 |     sag |      500 |      1.0 |  0.6226 |  0.6273 |     0.653s |  6.089s | COMPLETE |\n| 9     |      l1 |  0.1594 |    saga |      630 |      0.2 |  0.6236 |  0.6273 |     0.810s |  6.898s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 4\nBest parameters:\n --&gt; penalty: l2\n --&gt; C: 0.2122\n --&gt; solver: newton-cg\n --&gt; max_iter: 730\n --&gt; l1_ratio: 0.9\nBest evaluation --&gt; f1: 0.6273\nTime elapsed: 6.898s\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6188\nTest evaluation --&gt; f1: 0.6172\nTime elapsed: 0.352s\n-------------------------------------------------\nTime: 7.251s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 7.461s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.6172\n</pre> <p>We already used the test set to choose the best model for further tuning, so this set is no longer truly independent. Although it may not be directly visible in the results, using the test set now to evaluate the tuned LR model would be a mistake, since it carries a bias. For this reason, we have set apart an extra, independent set to validate the final model: the holdout set. If we are not going to use the test set for validation, we might as well use it to train the model and so optimize the use of the available data. Use the full_train method for this.</p> In\u00a0[10]: Copied! 
<pre># Re-train the model on the full dataset (train + test) \natom.lr_tuned.full_train()\n</pre> # Re-train the model on the full dataset (train + test)  atom.lr_tuned.full_train() <pre>Fit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6185\nTest evaluation --&gt; f1: 0.6185\nTime elapsed: 0.717s\n</pre> In\u00a0[11]: Copied! <pre># Evaluate on the holdout set\natom.lr_tuned.evaluate(rows=\"holdout\")\n</pre> # Evaluate on the holdout set atom.lr_tuned.evaluate(rows=\"holdout\") Out[11]: <pre>accuracy     0.8577\nap           0.7473\nba           0.7480\nf1           0.6352\njaccard      0.4654\nmcc          0.5606\nprecision    0.7559\nrecall       0.5477\nauc          0.8873\nName: LR_tuned, dtype: float64</pre> In\u00a0[13]: Copied! <pre>atom.lr_tuned.plot_prc(rows=\"holdout\", legend=\"upper right\")\n</pre> atom.lr_tuned.plot_prc(rows=\"holdout\", legend=\"upper right\")"}, {"location": "examples/holdout_set/#example-holdout-set", "title": "Example: Holdout set\u00b6", "text": "<p>This example shows when and how to use ATOM's holdout set in an exploration pipeline.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/holdout_set/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/holdout_set/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/holdout_set/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/", "title": "Hyperparameter tuning", "text": "In\u00a0[1]: Copied! 
<pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom optuna.distributions import IntDistribution\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from optuna.distributions import IntDistribution from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX, y = load_breast_cancer(return_X_y=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied! <pre># Initialize atom\natom = ATOMClassifier(X, y, n_jobs=4, verbose=2, random_state=1)\n</pre> # Initialize atom atom = ATOMClassifier(X, y, n_jobs=4, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nParallel processing with 4 cores.\nParallelization backend: loky\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># Train a MultiLayerPerceptron model on two metrics\n# using a custom number of hidden layers\natom.run(\n    models=\"MLP\",\n    metric=[\"f1\", \"ap\"],\n    n_trials=10,\n    est_params={\"activation\": \"relu\"},\n    ht_params={\n        \"distributions\": {\n            \"hidden_layer_1\": IntDistribution(2, 4),\n            \"hidden_layer_2\": IntDistribution(10, 20),\n            \"hidden_layer_3\": IntDistribution(10, 20),\n            \"hidden_layer_4\": IntDistribution(2, 4),\n        }\n    }\n)\n</pre> # Train a MultiLayerPerceptron model on two metrics # using a custom number of hidden layers atom.run(     models=\"MLP\",     metric=[\"f1\", \"ap\"],     n_trials=10,     est_params={\"activation\": \"relu\"},     ht_params={         \"distributions\": {             \"hidden_layer_1\": IntDistribution(2, 4),             \"hidden_layer_2\": IntDistribution(10, 20),             \"hidden_layer_3\": IntDistribution(10, 20),             \"hidden_layer_4\": IntDistribution(2, 4),         }     } ) <pre>\nTraining ========================= &gt;&gt;\nModels: MLP\nMetric: f1, ap\n\n\nRunning hyperparameter tuning for MultiLayerPerceptron...\n| trial | hidden_layer_1 | hidden_layer_2 | hidden_layer_3 | hidden_layer_4 |      f1 | best_f1 |      ap | best_ap | time_trial | time_ht |    state |\n| ----- | -------------- | -------------- | -------------- | -------------- | ------- | ------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |              3 |             17 |             10 |              2 |  0.9464 |  0.9464 |  0.9844 |  0.9844 |     9.139s |  9.139s | COMPLETE |\n| 1     |              2 |             11 |             12 |              3 |  0.9744 |  0.9744 |  0.9991 |  0.9991 |    11.466s | 20.605s | COMPLETE |\n| 2     |              3 |             15 |             14 |              4 |  0.9915 |  0.9915 |  0.9978 |  0.9991 |     8.570s | 29.175s | COMPLETE |\n| 3     |              2 |             19 |             10 |     
         4 |  0.9655 |  0.9915 |  0.9878 |  0.9991 |     9.208s | 38.383s | COMPLETE |\n| 4     |              3 |             16 |             11 |              2 |  0.9661 |  0.9915 |  0.9981 |  0.9991 |     0.657s | 39.039s | COMPLETE |\n| 5     |              4 |             20 |             13 |              4 |  0.9739 |  0.9915 |  0.9989 |  0.9991 |     0.623s | 39.662s | COMPLETE |\n| 6     |              4 |             19 |             10 |              2 |  0.9828 |  0.9915 |  0.9907 |  0.9991 |     0.601s | 40.263s | COMPLETE |\n| 7     |              2 |             19 |             11 |              3 |  0.7733 |  0.9915 |  0.9997 |  0.9997 |     0.601s | 40.863s | COMPLETE |\n| 8     |              4 |             15 |             17 |              2 |  0.9915 |  0.9915 |  0.9997 |  0.9997 |     0.601s | 41.464s | COMPLETE |\n| 9     |              4 |             19 |             10 |              4 |  0.9828 |  0.9915 |  0.9822 |  0.9997 |     0.599s | 42.062s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 8\nBest parameters:\n --&gt; hidden_layer_sizes: (4, 15, 17, 2)\nBest evaluation --&gt; f1: 0.9915   ap: 0.9997\nTime elapsed: 42.062s\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9965   ap: 0.9991\nTest evaluation --&gt; f1: 0.9718   ap: 0.9938\nTime elapsed: 1.515s\n-------------------------------------------------\nTime: 43.578s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 43.815s\n-------------------------------------\nMultiLayerPerceptron --&gt; f1: 0.9718   ap: 0.9938\n</pre> In\u00a0[5]: Copied! 
<pre># For multi-metric runs, the selected best trial is the first in the Pareto front\natom.mlp.best_trial\n</pre> # For multi-metric runs, the selected best trial is the first in the Pareto front atom.mlp.best_trial Out[5]: <pre>FrozenTrial(number=8, state=1, values=[0.9914529914529915, 0.9997077732320282], datetime_start=datetime.datetime(2023, 11, 4, 19, 13, 50, 113304), datetime_complete=datetime.datetime(2023, 11, 4, 19, 13, 50, 713850), params={'hidden_layer_1': 4, 'hidden_layer_2': 15, 'hidden_layer_3': 17, 'hidden_layer_4': 2}, user_attrs={'estimator': MLPClassifier(hidden_layer_sizes=(4, 15, 17, 2), random_state=1)}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'hidden_layer_1': IntDistribution(high=4, log=False, low=2, step=1), 'hidden_layer_2': IntDistribution(high=20, log=False, low=10, step=1), 'hidden_layer_3': IntDistribution(high=20, log=False, low=10, step=1), 'hidden_layer_4': IntDistribution(high=4, log=False, low=2, step=1)}, trial_id=8, value=None)</pre> In\u00a0[6]: Copied! <pre>atom.plot_pareto_front()\n</pre> atom.plot_pareto_front() In\u00a0[7]: Copied! 
<pre># If you are unhappy with the results, it's possible to conitnue the study\natom.mlp.hyperparameter_tuning(n_trials=5)\n</pre> # If you are unhappy with the results, it's possible to conitnue the study atom.mlp.hyperparameter_tuning(n_trials=5) <pre>Running hyperparameter tuning for MultiLayerPerceptron...\n| trial | hidden_layer_1 | hidden_layer_2 | hidden_layer_3 | hidden_layer_4 |      f1 | best_f1 |      ap | best_ap | time_trial | time_ht |    state |\n| ----- | -------------- | -------------- | -------------- | -------------- | ------- | ------- | ------- | ------- | ---------- | ------- | -------- |\n| 10    |              4 |             18 |             13 |              4 |  0.9831 |  0.9915 |  0.9997 |  0.9997 |     0.673s | 42.735s | COMPLETE |\n| 11    |              2 |             14 |             19 |              2 |  0.9421 |  0.9915 |  0.9899 |  0.9997 |     0.604s | 43.339s | COMPLETE |\n| 12    |              2 |             11 |             10 |              4 |  0.7733 |  0.9915 |    0.99 |  0.9997 |     0.617s | 43.955s | COMPLETE |\n| 13    |              2 |             12 |             15 |              2 |  0.9558 |  0.9915 |  0.9985 |  0.9997 |     0.595s | 44.550s | COMPLETE |\n| 14    |              3 |             11 |             16 |              4 |  0.7733 |  0.9915 |  0.9721 |  0.9997 |     0.663s | 45.212s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 8\nBest parameters:\n --&gt; hidden_layer_sizes: (4, 15, 17, 2)\nBest evaluation --&gt; f1: 0.9915   ap: 0.9997\nTime elapsed: 45.212s\n</pre> In\u00a0[8]: Copied! <pre># The trials attribute gives an overview of the trial results\natom.mlp.trials\n</pre> # The trials attribute gives an overview of the trial results atom.mlp.trials Out[8]: hidden_layer_1 hidden_layer_2 hidden_layer_3 hidden_layer_4 estimator f1 best_f1 ap best_ap time_trial time_ht state trial 0 3 17 10 2 MLPClassifier(hidden_layer_sizes=(3, 17, 10, 2... 
0.946429 0.991453 0.984402 0.999708 9.138911 9.138911 COMPLETE 1 2 11 12 3 MLPClassifier(hidden_layer_sizes=(2, 11, 12, 3... 0.974359 0.991453 0.999128 0.999708 11.466475 20.605386 COMPLETE 2 3 15 14 4 MLPClassifier(hidden_layer_sizes=(3, 15, 14, 4... 0.991453 0.991453 0.997842 0.999708 8.569545 29.174931 COMPLETE 3 2 19 10 4 MLPClassifier(hidden_layer_sizes=(2, 19, 10, 4... 0.965517 0.991453 0.987805 0.999708 9.207920 38.382851 COMPLETE 4 3 16 11 2 MLPClassifier(hidden_layer_sizes=(3, 16, 11, 2... 0.966102 0.991453 0.998086 0.999708 0.656597 39.039448 COMPLETE 5 4 20 13 4 MLPClassifier(hidden_layer_sizes=(4, 20, 13, 4... 0.973913 0.991453 0.998855 0.999708 0.622566 39.662014 COMPLETE 6 4 19 10 2 MLPClassifier(hidden_layer_sizes=(4, 19, 10, 2... 0.982759 0.991453 0.990748 0.999708 0.600547 40.262561 COMPLETE 7 2 19 11 3 MLPClassifier(hidden_layer_sizes=(2, 19, 11, 3... 0.773333 0.991453 0.999708 0.999708 0.600546 40.863107 COMPLETE 8 4 15 17 2 MLPClassifier(hidden_layer_sizes=(4, 15, 17, 2... 0.991453 0.991453 0.999708 0.999708 0.600546 41.463653 COMPLETE 9 4 19 10 4 MLPClassifier(hidden_layer_sizes=(4, 19, 10, 4... 0.982759 0.991453 0.982168 0.999708 0.598815 42.062468 COMPLETE 10 4 18 13 4 MLPClassifier(hidden_layer_sizes=(4, 18, 13, 4... 0.983051 0.991453 0.999708 0.999708 0.672611 42.735079 COMPLETE 11 2 14 19 2 MLPClassifier(hidden_layer_sizes=(2, 14, 19, 2... 0.942149 0.991453 0.989914 0.999708 0.603549 43.338628 COMPLETE 12 2 11 10 4 MLPClassifier(hidden_layer_sizes=(2, 11, 10, 4... 0.773333 0.991453 0.990024 0.999708 0.616561 43.955189 COMPLETE 13 2 12 15 2 MLPClassifier(hidden_layer_sizes=(2, 12, 15, 2... 0.955752 0.991453 0.998518 0.999708 0.594541 44.549730 COMPLETE 14 3 11 16 4 MLPClassifier(hidden_layer_sizes=(3, 11, 16, 4... 0.773333 0.991453 0.972070 0.999708 0.662602 45.212332 COMPLETE In\u00a0[9]: Copied! 
<pre># Select a custom best trial...\natom.mlp.best_trial = 2\n\n# ...and check that the best parameters are now those in the selected trial\natom.mlp.best_params\n</pre> # Select a custom best trial... atom.mlp.best_trial = 2  # ...and check that the best parameters are now those in the selected trial atom.mlp.best_params Out[9]: <pre>{'hidden_layer_sizes': (3, 15, 14, 4)}</pre> In\u00a0[10]: Copied! <pre># Lastly, fit the model on the complete training set \n# using the new combination of hyperparameters\natom.mlp.fit()\n</pre> # Lastly, fit the model on the complete training set  # using the new combination of hyperparameters atom.mlp.fit() <pre>Fit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.9983   ap: 0.9998\nTest evaluation --&gt; f1: 0.9718   ap: 0.9947\nTime elapsed: 3.048s\n</pre> In\u00a0[11]: Copied! <pre>atom.plot_trials()\n</pre> atom.plot_trials() In\u00a0[12]: Copied! <pre>atom.plot_parallel_coordinate()\n</pre> atom.plot_parallel_coordinate()"}, {"location": "examples/hyperparameter_tuning/#example-hyperparameter-tuning", "title": "Example: Hyperparameter tuning\u00b6", "text": "<p>This example shows an advanced example on how to optimize your model's hyperparameters for multi-metric runs.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/hyperparameter_tuning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/hyperparameter_tuning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/", "title": "Imbalanced datasets", "text": "In\u00a0[1]: Copied! 
<pre># Import packages\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n</pre> # Import packages from atom import ATOMClassifier from sklearn.datasets import make_classification In\u00a0[2]: Copied! <pre># Create a mock imbalanced dataset\nX, y = make_classification(\n    n_samples=5000,\n    n_features=30,\n    n_informative=20,\n    weights=(0.95,),\n    random_state=1,\n)\n</pre> # Create a mock imbalanced dataset X, y = make_classification(     n_samples=5000,     n_features=30,     n_informative=20,     weights=(0.95,),     random_state=1, ) In\u00a0[3]: Copied! <pre># Initialize atom\natom = ATOMClassifier(X, y, test_size=0.2, verbose=2, random_state=1)\n</pre> # Initialize atom atom = ATOMClassifier(X, y, test_size=0.2, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (5000, 31)\nTrain set size: 4000\nTest set size: 1000\n-------------------------------------\nMemory: 1.24 MB\nScaled: False\nOutlier values: 570 (0.5%)\n\n</pre> In\u00a0[4]: Copied! <pre># Let's have a look at the data. Note that, since the input wasn't\n# a dataframe, atom has given default names to the columns.\natom.head()\n</pre> # Let's have a look at the data. Note that, since the input wasn't # a dataframe, atom has given default names to the columns. atom.head() Out[4]: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 ... x21 x22 x23 x24 x25 x26 x27 x28 x29 target 0 -0.535760 -2.426045 1.256836 0.374501 -3.241958 -1.239468 -0.208750 -6.015995 3.698669 0.112512 ... 0.044302 -1.935727 10.870353 0.286755 -2.416507 0.556990 -1.522635 3.719201 1.449135 0 1 -3.311935 -3.149920 -0.801252 -2.644414 -0.704889 -3.312256 0.714515 2.992345 5.056910 3.036775 ... 
2.224359 0.451273 -1.822108 -1.435801 0.036132 -1.364583 1.215663 5.232161 1.408798 0 2 3.821199 1.328129 -1.000720 -13.151697 0.254253 1.263636 -1.088451 4.924264 -1.225646 -6.974824 ... 3.541222 1.686667 -13.763703 -1.321256 1.677687 0.774966 -5.067689 4.663386 -1.714186 0 3 5.931126 3.338830 0.545906 2.296355 -3.941088 3.527252 -0.158770 3.138381 -0.927460 -1.642079 ... -3.634442 7.853176 -8.457598 0.000490 -2.612756 -1.138206 0.497150 4.351289 -0.321748 0 4 -2.829472 -1.227185 -0.751892 3.056106 -1.988920 -2.219184 -0.075882 5.790102 -2.786671 2.023458 ... 4.057954 1.178564 -15.028187 1.627140 -1.093587 -0.422655 1.777011 6.660638 -2.553723 0 <p>5 rows \u00d7 31 columns</p> In\u00a0[6]: Copied! <pre># Let's start reducing the number of features\natom.feature_selection(\"rfe\", solver=\"rf\", n_features=12)\n</pre> # Let's start reducing the number of features atom.feature_selection(\"rfe\", solver=\"rf\", n_features=12) <pre>Fitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfe selected 12 features from the dataset.\n   --&gt; Dropping feature x1 (rank 8).\n   --&gt; Dropping feature x2 (rank 11).\n   --&gt; Dropping feature x4 (rank 3).\n   --&gt; Dropping feature x6 (rank 16).\n   --&gt; Dropping feature x7 (rank 14).\n   --&gt; Dropping feature x10 (rank 19).\n   --&gt; Dropping feature x12 (rank 13).\n   --&gt; Dropping feature x13 (rank 12).\n   --&gt; Dropping feature x14 (rank 9).\n   --&gt; Dropping feature x16 (rank 10).\n   --&gt; Dropping feature x18 (rank 17).\n   --&gt; Dropping feature x19 (rank 2).\n   --&gt; Dropping feature x20 (rank 4).\n   --&gt; Dropping feature x22 (rank 7).\n   --&gt; Dropping feature x23 (rank 5).\n   --&gt; Dropping feature x24 (rank 18).\n   --&gt; Dropping feature x25 (rank 6).\n   --&gt; Dropping feature x26 (rank 15).\n</pre> In\u00a0[7]: Copied! 
<pre># Fit a model directly on the imbalanced data\natom.run(\"RF\", metric=\"ba\")\n</pre> # Fit a model directly on the imbalanced data atom.run(\"RF\", metric=\"ba\") <pre>\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; ba: 1.0\nTest evaluation --&gt; ba: 0.5556\nTime elapsed: 2.497s\n-------------------------------------------------\nTime: 2.497s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.568s\n-------------------------------------\nRandomForest --&gt; ba: 0.5556 ~\n</pre> In\u00a0[8]: Copied! <pre># The transformer and the models have been added to the branch\natom.branch\n</pre> # The transformer and the models have been added to the branch atom.branch Out[8]: <pre>Branch(main)</pre> In\u00a0[9]: Copied! <pre># Create a new branch for oversampling\natom.branch = \"oversample\"\n</pre> # Create a new branch for oversampling atom.branch = \"oversample\" <pre>Successfully created new branch: oversample.\n</pre> In\u00a0[10]: Copied! <pre># Perform oversampling of the minority class\natom.balance(strategy=\"smote\")\n</pre> # Perform oversampling of the minority class atom.balance(strategy=\"smote\") <pre>Oversampling with SMOTE...\n --&gt; Adding 3570 samples to class 1.\n</pre> In\u00a0[11]: Copied! <pre>atom.classes  # Check the balanced training set!\n</pre> atom.classes  # Check the balanced training set! Out[11]: dataset train test 0 4731 3785 946 1 3839 3785 54 In\u00a0[12]: Copied! <pre># Train another model on the new branch. Add a tag after \n# the model's acronym to distinguish it from the first model\natom.run(\"rf_os\")  # os for oversample\n</pre> # Train another model on the new branch. 
Add a tag after  # the model's acronym to distinguish it from the first model atom.run(\"rf_os\")  # os for oversample <pre>\nTraining ========================= &gt;&gt;\nModels: RF_os\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; ba: 1.0\nTest evaluation --&gt; ba: 0.7672\nTime elapsed: 4.136s\n-------------------------------------------------\nTime: 4.136s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 4.248s\n-------------------------------------\nRandomForest --&gt; ba: 0.7672 ~\n</pre> In\u00a0[14]: Copied! <pre># Create the undersampling branch\n# Split from master to not adopt the oversmapling transformer\natom.branch = \"undersample_from_main\"\n</pre> # Create the undersampling branch # Split from master to not adopt the oversmapling transformer atom.branch = \"undersample_from_main\" <pre>Successfully created new branch: undersample.\n</pre> In\u00a0[15]: Copied! <pre>atom.classes  # In this branch, the data is still imbalanced\n</pre> atom.classes  # In this branch, the data is still imbalanced Out[15]: dataset train test 0 4731 3785 946 1 269 215 54 In\u00a0[16]: Copied! <pre># Perform undersampling of the majority class\natom.balance(strategy=\"NearMiss\")\n</pre> # Perform undersampling of the majority class atom.balance(strategy=\"NearMiss\") <pre>Undersampling with NearMiss...\n --&gt; Removing 3570 samples from class 0.\n</pre> In\u00a0[17]: Copied! 
<pre>atom.run(\"rf_us\")\n</pre> atom.run(\"rf_us\") <pre>\nTraining ========================= &gt;&gt;\nModels: RF_us\nMetric: ba\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; ba: 1.0\nTest evaluation --&gt; ba: 0.6706\nTime elapsed: 0.285s\n-------------------------------------------------\nTime: 0.285s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.321s\n-------------------------------------\nRandomForest --&gt; ba: 0.6706 ~\n</pre> In\u00a0[18]: Copied! <pre># Check that the branch only contains the desired transformers \natom.branch\n</pre> # Check that the branch only contains the desired transformers  atom.branch Out[18]: <pre>Branch(undersample)</pre> In\u00a0[19]: Copied! <pre># Visualize the complete pipeline\natom.plot_pipeline()\n</pre> # Visualize the complete pipeline atom.plot_pipeline() In\u00a0[20]: Copied! <pre>atom.evaluate()\n</pre> atom.evaluate() Out[20]: accuracy ap ba f1 jaccard mcc precision recall auc RF 0.952 0.6562 0.5556 0.2000 0.1111 0.3252 1.000 0.1111 0.9107 RF_os 0.956 0.6215 0.7672 0.5769 0.4054 0.5542 0.600 0.5556 0.9251 RF_us 0.509 0.3687 0.6706 0.1578 0.0857 0.1545 0.087 0.8519 0.8258 In\u00a0[21]: Copied! <pre>atom.plot_prc()\n</pre> atom.plot_prc() In\u00a0[22]: Copied! <pre>atom.plot_roc()\n</pre> atom.plot_roc()"}, {"location": "examples/imbalanced_datasets/#example-imbalanced-datasets", "title": "Example: Imbalanced datasets\u00b6", "text": "<p>This example shows how ATOM can help you handle imbalanced datasets. 
We will evaluate the performance of three different Random Forest models: one trained directly on the imbalanced dataset, one trained on an oversampled dataset and the last one trained on an undersampled dataset.</p>"}, {"location": "examples/imbalanced_datasets/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#oversampling", "title": "Oversampling\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#undersampling", "title": "Undersampling\u00b6", "text": ""}, {"location": "examples/imbalanced_datasets/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/in_training_validation/", "title": "In-training validation", "text": "In\u00a0[1]: Copied! <pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX, y = load_breast_cancer(return_X_y=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied! <pre># Initialize atom\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\n</pre> # Initialize atom atom = ATOMClassifier(X, y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># Not all models support in-training validation\n# You can chek which ones do using the available_models method\ndf = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]]\ndf[df[\"has_validation\"]]\n</pre> # Not all models support in-training validation # You can chek which ones do using the available_models method df = atom.available_models()[[\"acronym\", \"model\", \"has_validation\"]] df[df[\"has_validation\"]] Out[4]: acronym model has_validation 3 CatB CatBoost True 15 LGB LightGBM True 19 MLP MultiLayerPerceptron True 21 PA PassiveAggressive True 22 Perc Perceptron True 27 SGD StochasticGradientDescent True 29 XGB XGBoost True In\u00a0[5]: Copied! <pre># Run the models normally\natom.run(models=[\"MLP\", \"LGB\"], metric=\"auc\")\n</pre> # Run the models normally atom.run(models=[\"MLP\", \"LGB\"], metric=\"auc\") <pre>\nTraining ========================= &gt;&gt;\nModels: MLP, LGB\nMetric: auc\n\n\nResults for MultiLayerPerceptron:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 0.9997\nTest evaluation --&gt; auc: 0.9936\nTime elapsed: 1.821s\n-------------------------------------------------\nTime: 1.821s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; auc: 1.0\nTest evaluation --&gt; auc: 0.9775\nTime elapsed: 0.352s\n-------------------------------------------------\nTime: 0.352s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.236s\n-------------------------------------\nMultiLayerPerceptron --&gt; auc: 0.9936 !\nLightGBM             --&gt; auc: 0.9775\n</pre> In\u00a0[6]: Copied! <pre>atom.plot_evals(title=\"In-training validation scores\")\n</pre> atom.plot_evals(title=\"In-training validation scores\") In\u00a0[7]: Copied! 
<pre># Plot the validation on the train and test set\natom.lgb.plot_evals(dataset=\"train+test\", title=\"LightGBM's in-training validation\")\n</pre> # Plot the validation on the train and test set atom.lgb.plot_evals(dataset=\"train+test\", title=\"LightGBM's in-training validation\")"}, {"location": "examples/in_training_validation/#example-in-training-validation", "title": "Example: In-training validation\u00b6", "text": "<p>This example shows how to keep track of the model's performance during training.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/in_training_validation/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/in_training_validation/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/in_training_validation/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/memory_considerations/", "title": "Memory considerations", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport os\nimport tempfile\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import os import tempfile import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 
59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre># Define a temp directory to store the files in this example\ntempdir = tempfile.gettempdir()\n</pre> # Define a temp directory to store the files in this example tempdir = tempfile.gettempdir() In\u00a0[4]: Copied! <pre>def get_size(filepath):\n    \"\"\"Return the size of the object in MB.\"\"\"\n    return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\"\n</pre> def get_size(filepath):     \"\"\"Return the size of the object in MB.\"\"\"     return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\" In\u00a0[5]: Copied! <pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\n</pre> <p>Note that the datset takes ~25MB. We can reduce the size of the dataset using the shrink method, which reduces the dtypes to their smallest possible value.</p> In\u00a0[6]: Copied! 
<pre>atom.dtypes\n</pre> atom.dtypes Out[6]: <pre>Location          object\nMinTemp          float64\nMaxTemp          float64\nRainfall         float64\nEvaporation      float64\nSunshine         float64\nWindGustDir       object\nWindGustSpeed    float64\nWindDir9am        object\nWindDir3pm        object\nWindSpeed9am     float64\nWindSpeed3pm     float64\nHumidity9am      float64\nHumidity3pm      float64\nPressure9am      float64\nPressure3pm      float64\nCloud9am         float64\nCloud3pm         float64\nTemp9am          float64\nTemp3pm          float64\nRainToday         object\nRainTomorrow       int64\ndtype: object</pre> In\u00a0[7]: Copied! <pre>atom.shrink(str2cat=True)\n</pre> atom.shrink(str2cat=True) <pre>The column dtypes are successfully converted.\n</pre> In\u00a0[8]: Copied! <pre>atom.dtypes\n</pre> atom.dtypes Out[8]: <pre>Location         category\nMinTemp           Float32\nMaxTemp           Float32\nRainfall          Float32\nEvaporation       Float32\nSunshine          Float32\nWindGustDir      category\nWindGustSpeed       Int16\nWindDir9am       category\nWindDir3pm       category\nWindSpeed9am        Int16\nWindSpeed3pm         Int8\nHumidity9am          Int8\nHumidity3pm          Int8\nPressure9am       Float32\nPressure3pm       Float32\nCloud9am             Int8\nCloud3pm             Int8\nTemp9am           Float32\nTemp3pm           Float32\nRainToday        category\nRainTomorrow         Int8\ndtype: object</pre> In\u00a0[9]: Copied! <pre># Let's check the memory usage again...\n# Notice the huge drop!\natom.stats()\n</pre> # Let's check the memory usage again... # Notice the huge drop! atom.stats() <pre>Dataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 9.67 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n</pre> In\u00a0[10]: Copied! 
<pre># Now, we create some new branches to train models with different trasnformers\natom.impute()\natom.encode()\natom.run(\"LDA\")\n\natom.branch = \"b2\"\natom.scale()\natom.run(\"LDA_scaled\")\n\natom.branch = \"b3_from_main\"\natom.normalize()\natom.run(\"LDA_norm\")\n</pre> # Now, we create some new branches to train models with different trasnformers atom.impute() atom.encode() atom.run(\"LDA\")  atom.branch = \"b2\" atom.scale() atom.run(\"LDA_scaled\")  atom.branch = \"b3_from_main\" atom.normalize() atom.run(\"LDA_norm\") <pre>Fitting Imputer...\nImputing missing values...\n --&gt; Dropping 637 samples due to missing values in feature MinTemp.\n --&gt; Dropping 322 samples due to missing values in feature MaxTemp.\n --&gt; Dropping 1406 samples due to missing values in feature Rainfall.\n --&gt; Dropping 60843 samples due to missing values in feature Evaporation.\n --&gt; Dropping 67816 samples due to missing values in feature Sunshine.\n --&gt; Dropping 9330 samples due to missing values in feature WindGustDir.\n --&gt; Dropping 9270 samples due to missing values in feature WindGustSpeed.\n --&gt; Dropping 10013 samples due to missing values in feature WindDir9am.\n --&gt; Dropping 3778 samples due to missing values in feature WindDir3pm.\n --&gt; Dropping 1348 samples due to missing values in feature WindSpeed9am.\n --&gt; Dropping 2630 samples due to missing values in feature WindSpeed3pm.\n --&gt; Dropping 1774 samples due to missing values in feature Humidity9am.\n --&gt; Dropping 3610 samples due to missing values in feature Humidity3pm.\n --&gt; Dropping 14014 samples due to missing values in feature Pressure9am.\n --&gt; Dropping 13981 samples due to missing values in feature Pressure3pm.\n --&gt; Dropping 53657 samples due to missing values in feature Cloud9am.\n --&gt; Dropping 57094 samples due to missing values in feature Cloud3pm.\n --&gt; Dropping 904 samples due to missing values in feature Temp9am.\n --&gt; Dropping 2726 samples due to 
missing values in feature Temp3pm.\n --&gt; Dropping 1406 samples due to missing values in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 26 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n\nTraining ========================= &gt;&gt;\nModels: LDA\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6213\nTest evaluation --&gt; f1: 0.6341\nTime elapsed: 0.375s\n-------------------------------------------------\nTime: 0.375s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.613s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6341\nSuccessfully created new branch: b2.\nFitting Scaler...\nScaling features...\n\nTraining ========================= &gt;&gt;\nModels: LDA_scaled\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6213\nTest evaluation --&gt; f1: 0.6341\nTime elapsed: 0.390s\n-------------------------------------------------\nTime: 0.390s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.626s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6341\nSuccessfully created new branch: b3.\nFitting Normalizer...\nNormalizing features...\n\nTraining ========================= &gt;&gt;\nModels: LDA_norm\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6267\nTest evaluation --&gt; f1: 0.6368\nTime elapsed: 0.369s\n-------------------------------------------------\nTime: 0.369s\n\n\nFinal results ==================== 
&gt;&gt;\nTotal time: 0.626s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6368\n</pre> In\u00a0[11]: Copied! <pre># If we save atom now, notice the size\n# This is because atom keeps a copy of every branch in memory\nfilename = tempdir + \"atom1\"\natom.save(filename)\nget_size(filename)\n</pre> # If we save atom now, notice the size # This is because atom keeps a copy of every branch in memory filename = tempdir + \"atom1\" atom.save(filename) get_size(filename) <pre>ATOMClassifier successfully saved.\n</pre> Out[11]: <pre>'34.92MB'</pre> <p>To avoid large memory usages, set the <code>memory</code> parameter.</p> In\u00a0[12]: Copied! <pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\natom.shrink(str2cat=True)\natom.impute()\natom.encode()\natom.run(\"LDA\")\n\natom.branch = \"b2\"\natom.scale()\natom.run(\"LDA_scaled\")\n\natom.branch = \"b3_from_main\"\natom.normalize()\natom.run(\"LDA_norm\")\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1) atom.shrink(str2cat=True) atom.impute() atom.encode() atom.run(\"LDA\")  atom.branch = \"b2\" atom.scale() atom.run(\"LDA_scaled\")  atom.branch = \"b3_from_main\" atom.normalize() atom.run(\"LDA_norm\") <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nCache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nThe column dtypes are successfully converted.\nFitting Imputer...\nImputing missing values...\nFitting Encoder...\nEncoding categorical columns...\n\nTraining ========================= &gt;&gt;\nModels: LDA\nMetric: 
f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6233\nTest evaluation --&gt; f1: 0.6248\nTime elapsed: 0.445s\n-------------------------------------------------\nTime: 0.445s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.708s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6248\nSuccessfully created new branch: b2.\nFitting Scaler...\nScaling features...\n\nTraining ========================= &gt;&gt;\nModels: LDA_scaled\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6233\nTest evaluation --&gt; f1: 0.6248\nTime elapsed: 0.454s\n-------------------------------------------------\nTime: 0.454s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.737s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6248\nSuccessfully created new branch: b3.\nFitting Normalizer...\nNormalizing features...\n\nTraining ========================= &gt;&gt;\nModels: LDA_norm\nMetric: f1\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.6279\nTest evaluation --&gt; f1: 0.6298\nTime elapsed: 0.447s\n-------------------------------------------------\nTime: 0.447s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.740s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; f1: 0.6298\n</pre> In\u00a0[13]: Copied! 
<pre># And now, it only takes a fraction of the previous size\n# This is because the data of inactive branches is now stored locally\nfilename = tempdir + \"atom2\"\natom.save(filename)\nget_size(filename)\n</pre> # And now, it only takes a fraction of the previous size # This is because the data of inactive branches is now stored locally filename = tempdir + \"atom2\" atom.save(filename) get_size(filename) <pre>ATOMClassifier successfully saved.\n</pre> Out[13]: <pre>'9.63MB'</pre> <p>Additionnaly, repeated calls to the same transformers with the same data will use the cached results. Don't forget to specify the <code>random_state</code> parameter to ensure the data remains the exact same.</p> In\u00a0[14]: Copied! <pre>atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\natom.shrink(str2cat=True)\n</pre> atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1) atom.shrink(str2cat=True) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nCache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nThe column dtypes are successfully converted.\n</pre> In\u00a0[15]: Copied! <pre># Note the transformers are no longer fitted,\n# instead the results are immediately read from cache\natom.impute()\natom.encode()\n</pre> # Note the transformers are no longer fitted, # instead the results are immediately read from cache atom.impute() atom.encode() <pre>Retrieving cached results for Imputer...\nRetrieving cached results for Encoder...\nEncoding categorical columns...\n</pre> In\u00a0[16]: Copied! 
<pre>atom.dataset\n</pre> atom.dataset Out[16]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 0.075703 13.0 30.5 0.0 6.8 10.0 0.271668 59 0.312069 0.273733 ... 19 8 1013.599976 1008.0 0 2 19.6 29.9 0.0 0 1 0.245394 15.3 22.4 16.0 4.2 3.3 0.204934 39 0.236475 0.199626 ... 83 63 1025.5 1023.599976 6 6 16.9 21.1 1.0 1 2 0.262397 27.9 34.5 0.0 9.0 7.9 0.1737 72 0.236475 0.306935 ... 72 63 1009.0 1005.5 7 7 31.0 33.099998 0.0 1 3 0.239174 12.9 27.9 0.0 5.4 8.6 0.269421 39 0.256213 0.286159 ... 69 56 1023.400024 1019.799988 7 7 14.7 23.4 0.0 0 4 0.253089 7.4 14.3 0.8 2.8 4.0 0.210095 31 0.269333 0.167808 ... 84 62 1023.599976 1023.200012 4 7 9.0 13.6 0.0 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 56415 0.295559 23.9 28.1 0.0 2.6 7.7 0.241448 44 0.279553 0.259391 ... 86 79 1015.900024 1013.900024 7 7 25.799999 27.5 0.0 0 56416 0.217037 13.6 24.6 0.0 4.4 7.8 0.1737 39 0.193908 0.197102 ... 87 61 1023.200012 1022.599976 7 3 17.299999 21.4 0.0 0 56417 0.112176 16.299999 38.700001 0.0 10.2 13.4 0.1737 24 0.149795 0.168702 ... 29 8 1013.5 1010.299988 5 2 26.4 36.900002 0.0 0 56418 0.295559 11.5 19.200001 0.8 2.0 7.0 0.147458 22 0.13795 0.195807 ... 73 52 1021.299988 1018.799988 3 4 17.1 18.4 0.0 0 56419 0.403054 5.9 18.0 0.4 0.8 6.7 0.269421 26 0.312069 0.286159 ... 92 65 1028.0 1025.300049 3 2 9.4 16.6 0.0 0 <p>56420 rows \u00d7 22 columns</p>"}, {"location": "examples/memory_considerations/#example-memory-considerations", "title": "Example: Memory considerations\u00b6", "text": "<p>This example shows how to use the <code>memory</code> parameter to make efficient use of the available memory.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/memory_considerations/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/memory_considerations/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multi_metric/", "title": "Multi-metric runs", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMRegressor\n</pre> # Import packages import pandas as pd from atom import ATOMRegressor In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")  # Let's have a look X.head() Out[2]: Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 In\u00a0[3]: Copied! <pre>atom = ATOMRegressor(X, n_jobs=1, verbose=2, random_state=1)\n</pre> atom = ATOMRegressor(X, n_jobs=1, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (4177, 9)\nTrain set size: 3342\nTest set size: 835\n-------------------------------------\nMemory: 300.88 kB\nScaled: False\nCategorical features: 1 (12.5%)\nOutlier values: 189 (0.6%)\n\n</pre> In\u00a0[4]: Copied! <pre>atom.encode()\n</pre> atom.encode() <pre>Fitting Encoder...\nEncoding categorical columns...\n --&gt; OneHot-encoding feature Sex. Contains 3 classes.\n</pre> In\u00a0[5]: Copied! 
<pre># For every step of the BO, both metrics are calculated,\n# but only the first is used for optimization!\natom.run(\n    models=[\"lsvm\", \"hGBM\"],\n    metric=(\"r2\", \"rmse\"),\n    n_trials=10,\n    n_bootstrap=6,\n)\n</pre> # For every step of the BO, both metrics are calculated, # but only the first is used for optimization! atom.run(     models=[\"lsvm\", \"hGBM\"],     metric=(\"r2\", \"rmse\"),     n_trials=10,     n_bootstrap=6, ) <pre>\nTraining ========================= &gt;&gt;\nModels: lSVM, hGBM\nMetric: r2, rmse\n\n\nRunning hyperparameter tuning for LinearSVM...\n| trial |                    loss |       C |    dual |      r2 | best_r2 |    rmse | best_rmse | time_trial | time_ht |    state |\n| ----- | ----------------------- | ------- | ------- | ------- | ------- | ------- | --------- | ---------- | ------- | -------- |\n| 0     | squared_epsilon_insen.. |   0.001 |    True |  0.2887 |  0.2887 | -2.6528 |   -2.6528 |     0.043s |  0.043s | COMPLETE |\n| 1     | squared_epsilon_insen.. |  0.0534 |   False |  0.3862 |  0.3862 | -2.5926 |   -2.5926 |     0.043s |  0.086s | COMPLETE |\n| 2     | squared_epsilon_insen.. |  0.0105 |    True |   0.433 |   0.433 | -2.4084 |   -2.4084 |     0.054s |  0.140s | COMPLETE |\n| 3     |     epsilon_insensitive |  0.6215 |    True |  0.4022 |   0.433 | -2.5251 |   -2.4084 |     0.045s |  0.185s | COMPLETE |\n| 4     | squared_epsilon_insen.. |  0.0369 |   False |  0.4057 |   0.433 | -2.5477 |   -2.4084 |     0.040s |  0.225s | COMPLETE |\n| 5     |     epsilon_insensitive |  0.0016 |    True | -1.5344 |   0.433 | -5.0102 |   -2.4084 |     0.035s |  0.260s | COMPLETE |\n| 6     | squared_epsilon_insen.. | 61.5811 |   False |  0.4354 |  0.4354 | -2.3845 |   -2.3845 |     0.034s |  0.294s | COMPLETE |\n| 7     | squared_epsilon_insen.. 
|  14.898 |   False |  0.4925 |  0.4925 | -2.2628 |   -2.2628 |     0.035s |  0.329s | COMPLETE |\n| 8     |     epsilon_insensitive |  0.0252 |    True |  0.3695 |  0.4925 | -2.6178 |   -2.2628 |     0.035s |  0.364s | COMPLETE |\n| 9     | squared_epsilon_insen.. |  0.0294 |    True |  0.4767 |  0.4925 | -2.3896 |   -2.2628 |     0.044s |  0.408s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 7\nBest parameters:\n --&gt; loss: squared_epsilon_insensitive\n --&gt; C: 14.898\n --&gt; dual: False\nBest evaluation --&gt; r2: 0.4925   rmse: -2.2628\nTime elapsed: 0.408s\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.4592   rmse: -2.3795\nTest evaluation --&gt; r2: 0.4584   rmse: -2.3369\nTime elapsed: 0.089s\nBootstrap ---------------------------------------\nEvaluation --&gt; r2: 0.4577 \u00b1 0.002   rmse: -2.3384 \u00b1 0.0043\nTime elapsed: 0.094s\n-------------------------------------------------\nTime: 0.592s\n\n\nRunning hyperparameter tuning for HistGradientBoosting...\n| trial |      loss | quantile | learning_rate | max_iter | max_leaf_nodes | max_depth | min_samples_leaf | l2_regularization |      r2 | best_r2 |    rmse | best_rmse | time_trial | time_ht |    state |\n| ----- | --------- | -------- | ------------- | -------- | -------------- | --------- | ---------------- | ----------------- | ------- | ------- | ------- | --------- | ---------- | ------- | -------- |\n| 0     | absolut.. 
|      0.1 |        0.0236 |      180 |             26 |        12 |               11 |               0.0 |  0.5373 |  0.5373 | -2.1398 |   -2.1398 |     0.968s |  0.968s | COMPLETE |\n| 1     |     gamma |      0.5 |         0.242 |      160 |             38 |         3 |               20 |               0.0 |   0.574 |   0.574 | -2.1598 |   -2.1398 |     0.160s |  1.128s | COMPLETE |\n| 2     |  quantile |      0.4 |        0.2448 |      210 |             12 |         3 |               25 |               0.3 |  0.4714 |   0.574 | -2.3253 |   -2.1398 |     0.422s |  1.550s | COMPLETE |\n| 3     |  quantile |      0.6 |         0.017 |      480 |             28 |        16 |               13 |               0.1 |  0.5712 |   0.574 | -2.1385 |   -2.1385 |     3.405s |  4.956s | COMPLETE |\n| 4     | squared.. |      1.0 |        0.2649 |       70 |             10 |        10 |               28 |               0.8 |  0.5561 |   0.574 | -2.2019 |   -2.1385 |     0.148s |  5.104s | COMPLETE |\n| 5     | squared.. |      0.1 |        0.0283 |      360 |             32 |         9 |               11 |               0.5 |  0.5464 |   0.574 | -2.1197 |   -2.1197 |     1.248s |  6.352s | COMPLETE |\n| 6     |  quantile |      0.4 |        0.1264 |      380 |             37 |        12 |               29 |               1.0 |  0.4416 |   0.574 | -2.3713 |   -2.1197 |     3.002s |  9.354s | COMPLETE |\n| 7     |     gamma |      0.6 |         0.678 |      330 |             25 |         6 |               12 |               0.8 |  0.4299 |   0.574 | -2.3984 |   -2.1197 |     0.739s | 10.092s | COMPLETE |\n| 8     | absolut.. |      0.9 |        0.0831 |      280 |             42 |        16 |               10 |               1.0 |  0.5242 |   0.574 | -2.2742 |   -2.1197 |     2.002s | 12.094s | COMPLETE |\n| 9     | absolut.. 
|      0.6 |        0.0373 |      300 |             40 |        13 |               17 |               0.8 |  0.5685 |   0.574 |   -2.17 |   -2.1197 |     1.859s | 13.953s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 5\nBest parameters:\n --&gt; loss: squared_error\n --&gt; quantile: 0.1\n --&gt; learning_rate: 0.0283\n --&gt; max_iter: 360\n --&gt; max_leaf_nodes: 32\n --&gt; max_depth: 9\n --&gt; min_samples_leaf: 11\n --&gt; l2_regularization: 0.5\nBest evaluation --&gt; r2: 0.5464   rmse: -2.1197\nTime elapsed: 13.953s\nFit ---------------------------------------------\nTrain evaluation --&gt; r2: 0.7959   rmse: -1.4619\nTest evaluation --&gt; r2: 0.5479   rmse: -2.1351\nTime elapsed: 1.470s\nBootstrap ---------------------------------------\nEvaluation --&gt; r2: 0.5259 \u00b1 0.0154   rmse: -2.1861 \u00b1 0.0352\nTime elapsed: 7.930s\n-------------------------------------------------\nTime: 23.353s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 25.299s\n-------------------------------------\nLinearSVM            --&gt; r2: 0.4577 \u00b1 0.002   rmse: -2.3384 \u00b1 0.0043\nHistGradientBoosting --&gt; r2: 0.5259 \u00b1 0.0154   rmse: -2.1861 \u00b1 0.0352 ~ !\n</pre> In\u00a0[6]: Copied! <pre># Check the robustness of the pipeline using cross-validation\natom.winner.cross_validate()\n</pre> # Check the robustness of the pipeline using cross-validation atom.winner.cross_validate() <pre>Applying cross-validation...\n</pre> Out[6]: train_r2 test_r2 train_rmse test_rmse time (s) 0 0.796038 0.541990 -1.453147 -2.196943 1.392266 1 0.794954 0.540424 -1.457709 -2.196179 1.436932 2 0.790722 0.505922 -1.492522 -2.153457 1.444314 3 0.785317 0.580703 -1.474827 -2.189902 1.432303 4 0.795872 0.547917 -1.461929 -2.135072 1.747591 mean 0.792581 0.543391 -1.468027 -2.174311 1.490681 std 0.004114 0.023780 0.014222 0.025330 0.129719 In\u00a0[8]: Copied! 
<pre># The columns in the results dataframe contain one for each metric\natom.results[[\"r2_ht\", \"r2_train\", \"r2_test\", \"rmse_ht\", \"rmse_train\", \"rmse_test\"]]\n</pre> # The columns in the results dataframe contain one for each metric atom.results[[\"r2_ht\", \"r2_train\", \"r2_test\", \"rmse_ht\", \"rmse_train\", \"rmse_test\"]] Out[8]: r2_ht r2_train r2_test rmse_ht rmse_train rmse_test lSVM 0.492530 0.4583 0.4552 -2.262754 -2.3815 -2.3439 hGBM 0.546368 0.7183 0.4971 -2.119672 -1.7173 -2.2518 In\u00a0[9]: Copied! <pre># Some plots allow us to choose the metric we want to show\nwith atom.canvas():\n    atom.plot_trials(metric=\"r2\", title=\"Hyperparameter tuning performance for R2\")\n    atom.plot_trials(metric=\"rmse\", title=\"Hyperparameter tuning performance for RMSE\")\n</pre> # Some plots allow us to choose the metric we want to show with atom.canvas():     atom.plot_trials(metric=\"r2\", title=\"Hyperparameter tuning performance for R2\")     atom.plot_trials(metric=\"rmse\", title=\"Hyperparameter tuning performance for RMSE\") In\u00a0[10]: Copied! <pre>atom.plot_results(metric=\"r2\")\n</pre> atom.plot_results(metric=\"r2\")"}, {"location": "examples/multi_metric/#example-multi-metric-runs", "title": "Example: Multi-metric runs\u00b6", "text": "<p>This example shows how to evaluate an atom's pipeline on multiple metrics.</p> <p>Import the breast cancer dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/multi_metric/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multi_metric/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multi_metric/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multiclass_classification/", "title": "Multiclass classification", "text": "In\u00a0[1]: Copied! 
<pre># Import packages\nfrom sklearn.datasets import load_wine\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_wine from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX, y = load_wine(return_X_y=True, as_frame=True)\n\n# Let's have a look\nX.head()\n</pre> # Load data X, y = load_wine(return_X_y=True, as_frame=True)  # Let's have a look X.head() Out[2]: alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline 0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 In\u00a0[3]: Copied! <pre>atom = ATOMClassifier(X, y, n_jobs=-1, verbose=2, random_state=1)\n\n# Fit the pipeline with the selected models\natom.run(\n    models=[\"LR\",\"LDA\", \"RF\"],\n    metric=\"roc_auc_ovr\",\n    n_trials=14,\n    n_bootstrap=5,\n    errors=\"raise\",\n)\n</pre> atom = ATOMClassifier(X, y, n_jobs=-1, verbose=2, random_state=1)  # Fit the pipeline with the selected models atom.run(     models=[\"LR\",\"LDA\", \"RF\"],     metric=\"roc_auc_ovr\",     n_trials=14,     n_bootstrap=5,     errors=\"raise\", ) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multiclass classification.\nParallel processing with 16 cores.\nParallelization backend: loky\n\nDataset stats ==================== &gt;&gt;\nShape: (178, 14)\nTrain set size: 143\nTest set size: 35\n-------------------------------------\nMemory: 19.36 kB\nScaled: False\nOutlier values: 12 (0.6%)\n\n\nTraining ========================= &gt;&gt;\nModels: LR, LDA, RF\nMetric: 
roc_auc_ovr\n\n\nRunning hyperparameter tuning for LogisticRegression...\n| trial | penalty |       C |  solver | max_iter | l1_ratio | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------- | ------- | ------- | -------- | -------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |      l1 |  0.0054 |    saga |      480 |      0.7 |         0.5 |              0.5 |    10.567s | 10.567s | COMPLETE |\n| 1     |      l1 |   0.122 |    saga |      380 |      0.7 |      0.9951 |           0.9951 |    11.247s | 21.814s | COMPLETE |\n| 2     |      l2 |  0.0071 |     sag |      720 |      0.3 |         1.0 |              1.0 |    12.060s | 33.874s | COMPLETE |\n| 3     |      l1 | 87.9641 | libli.. |      920 |      0.3 |         1.0 |              1.0 |    10.158s | 44.032s | COMPLETE |\n| 4     |      l2 |  0.0114 |     sag |      630 |      0.7 |         1.0 |              1.0 |     7.990s | 52.022s | COMPLETE |\n| 5     |      l2 |  0.0018 |     sag |      920 |      0.1 |         1.0 |              1.0 |    11.685s | 01m:04s | COMPLETE |\n| 6     |      l2 | 43.4053 |     sag |      780 |      0.3 |         1.0 |              1.0 |     8.361s | 01m:12s | COMPLETE |\n| 7     |      l2 |  2.0759 | libli.. |      470 |      0.2 |         1.0 |              1.0 |     8.213s | 01m:20s | COMPLETE |\n| 8     |    None |   0.043 |     sag |      110 |      1.0 |         1.0 |              1.0 |     7.450s | 01m:28s | COMPLETE |\n| 9     |      l1 | 46.0233 |    saga |      740 |      0.1 |         1.0 |              1.0 |     7.951s | 01m:36s | COMPLETE |\n| 10    |      l2 |  0.4557 |   lbfgs |      280 |      0.5 |         1.0 |              1.0 |     8.807s | 01m:44s | COMPLETE |\n| 11    |      l2 |  0.0013 | libli.. |      940 |      0.4 |         1.0 |              1.0 |     7.970s | 01m:52s | COMPLETE |\n| 12    |      l2 |  4.8717 | newto.. 
|      780 |      0.3 |         1.0 |              1.0 |     8.202s | 02m:01s | COMPLETE |\n| 13    |      l2 |  0.0324 | libli.. |     1000 |      0.0 |         1.0 |              1.0 |     7.676s | 02m:08s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 2\nBest parameters:\n --&gt; penalty: l2\n --&gt; C: 0.0071\n --&gt; solver: sag\n --&gt; max_iter: 720\n --&gt; l1_ratio: 0.3\nBest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 02m:08s\nFit ---------------------------------------------\nTrain evaluation --&gt; roc_auc_ovr: 0.9991\nTest evaluation --&gt; roc_auc_ovr: 0.9977\nTime elapsed: 0.542s\nBootstrap ---------------------------------------\nEvaluation --&gt; roc_auc_ovr: 0.9984 \u00b1 0.001\nTime elapsed: 0.603s\n-------------------------------------------------\nTime: 02m:09s\n\n\nRunning hyperparameter tuning for LinearDiscriminantAnalysis...\n| trial |  solver | shrinkage | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |    lsqr |       0.9 |      0.9221 |           0.9221 |     0.048s |  0.048s | COMPLETE |\n| 1     |   eigen |       1.0 |      0.9121 |           0.9221 |     0.027s |  0.074s | COMPLETE |\n| 2     |   eigen |       1.0 |      0.9121 |           0.9221 |     0.001s |  0.075s | COMPLETE |\n| 3     |    lsqr |       0.7 |      0.8638 |           0.9221 |     0.025s |  0.100s | COMPLETE |\n| 4     |   eigen |       0.7 |      0.9019 |           0.9221 |     0.024s |  0.124s | COMPLETE |\n| 5     |    lsqr |      auto |         1.0 |              1.0 |     0.025s |  0.149s | COMPLETE |\n| 6     |   eigen |       1.0 |      0.9121 |              1.0 |     0.000s |  0.149s | COMPLETE |\n| 7     |    lsqr |       1.0 |      0.9445 |              1.0 |     0.026s |  0.175s | COMPLETE |\n| 8     |     svd |      None |         1.0 |              1.0 |     0.025s |  0.200s | 
COMPLETE |\n| 9     |     svd |      None |         1.0 |              1.0 |     0.001s |  0.201s | COMPLETE |\n| 10    |    lsqr |      auto |         1.0 |              1.0 |     0.002s |  0.203s | COMPLETE |\n| 11    |     svd |      None |         1.0 |              1.0 |     0.002s |  0.205s | COMPLETE |\n| 12    |     svd |      None |         1.0 |              1.0 |     0.001s |  0.206s | COMPLETE |\n| 13    |     svd |      None |         1.0 |              1.0 |     0.001s |  0.207s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 5\nBest parameters:\n --&gt; solver: lsqr\n --&gt; shrinkage: auto\nBest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 0.207s\nFit ---------------------------------------------\nTrain evaluation --&gt; roc_auc_ovr: 1.0\nTest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 0.025s\nBootstrap ---------------------------------------\nEvaluation --&gt; roc_auc_ovr: 0.9998 \u00b1 0.0005\nTime elapsed: 0.038s\n-------------------------------------------------\nTime: 0.271s\n\n\nRunning hyperparameter tuning for RandomForest...\n| trial | n_estimators | criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha | roc_auc_ovr | best_roc_auc_ovr | time_trial | time_ht |    state |\n| ----- | ------------ | --------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ----------- | ---------------- | ---------- | ------- | -------- |\n| 0     |          210 |      gini |        10 |                17 |               20 |          0.5 |     False |        None |       0.0 |      0.9803 |           0.9803 |     0.249s |  0.249s | COMPLETE |\n| 1     |          380 |      gini |         4 |                15 |                3 |          0.9 |     False |        None |      0.01 |      0.9816 |           0.9816 |     0.456s |  0.705s | COMPLETE |\n| 2     |          380 |   entropy |         6 |  
               2 |               13 |          0.9 |     False |        None |      0.03 |      0.9944 |           0.9944 |     0.502s |  1.206s | COMPLETE |\n| 3     |          470 |      gini |        11 |                 9 |               18 |          nan |      True |         0.6 |     0.025 |      0.9569 |           0.9944 |     9.106s | 10.312s | COMPLETE |\n| 4     |          100 |   entropy |        12 |                14 |                6 |          0.9 |     False |         nan |     0.035 |         1.0 |              1.0 |     8.530s | 18.842s | COMPLETE |\n| 5     |          470 |   entropy |        13 |                11 |                1 |          nan |      True |         0.6 |      0.01 |         1.0 |              1.0 |     1.391s | 20.233s | COMPLETE |\n| 6     |          250 |      gini |        14 |                13 |               17 |          0.7 |      True |         nan |      0.02 |         1.0 |              1.0 |     0.754s | 20.987s | COMPLETE |\n| 7     |          220 |      gini |         5 |                10 |                7 |          0.5 |      True |         0.9 |     0.035 |      0.9981 |              1.0 |     0.712s | 21.699s | COMPLETE |\n| 8     |          130 |   entropy |         4 |                 6 |               11 |          0.9 |     False |         nan |      0.03 |         1.0 |              1.0 |     0.532s | 22.231s | COMPLETE |\n| 9     |          370 |      gini |        12 |                 2 |                4 |          0.5 |     False |         nan |      0.02 |      0.9916 |              1.0 |     0.823s | 23.055s | COMPLETE |\n| 10    |           10 |   entropy |        12 |                20 |                7 |         log2 |     False |         nan |     0.035 |         1.0 |              1.0 |     0.522s | 23.577s | COMPLETE |\n| 11    |           70 |   entropy |        13 |                12 |                1 |         None |      True |         0.5 |      0.01 |      0.9928 |              
1.0 |     0.614s | 24.191s | COMPLETE |\n| 12    |          500 |   entropy |         9 |                 7 |                7 |          0.6 |      True |         0.6 |      0.01 |         1.0 |              1.0 |     1.139s | 25.330s | COMPLETE |\n| 13    |          140 |   entropy |        16 |                16 |                1 |          0.8 |      True |         0.7 |       0.0 |         1.0 |              1.0 |     0.750s | 26.080s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 4\nBest parameters:\n --&gt; n_estimators: 100\n --&gt; criterion: entropy\n --&gt; max_depth: 12\n --&gt; min_samples_split: 14\n --&gt; min_samples_leaf: 6\n --&gt; max_features: 0.9\n --&gt; bootstrap: False\n --&gt; max_samples: None\n --&gt; ccp_alpha: 0.035\nBest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 26.080s\nFit ---------------------------------------------\nTrain evaluation --&gt; roc_auc_ovr: 0.9993\nTest evaluation --&gt; roc_auc_ovr: 1.0\nTime elapsed: 0.737s\nBootstrap ---------------------------------------\nEvaluation --&gt; roc_auc_ovr: 0.9936 \u00b1 0.0067\nTime elapsed: 0.721s\n-------------------------------------------------\nTime: 27.539s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 02m:40s\n-------------------------------------\nLogisticRegression         --&gt; roc_auc_ovr: 0.9984 \u00b1 0.001\nLinearDiscriminantAnalysis --&gt; roc_auc_ovr: 0.9998 \u00b1 0.0005 !\nRandomForest               --&gt; roc_auc_ovr: 0.9936 \u00b1 0.0067\n</pre> In\u00a0[4]: Copied! <pre>atom.results\n</pre> atom.results Out[4]: roc_auc_ovr_ht time_ht roc_auc_ovr_train roc_auc_ovr_test time_fit roc_auc_ovr_bootstrap time_bootstrap time LR 1.0 128.337325 0.9979 0.9977 0.542487 0.998413 0.602810 129.482622 LDA 1.0 0.207456 1.0000 0.9989 0.025409 0.999773 0.038035 0.270900 RF 1.0 26.080413 0.9951 0.9919 0.737324 0.993613 0.721398 27.539135 In\u00a0[5]: Copied! 
<pre># Show the score for some different metrics\natom.evaluate([\"precision_macro\", \"recall_macro\", \"jaccard_weighted\"])\n</pre> # Show the score for some different metrics atom.evaluate([\"precision_macro\", \"recall_macro\", \"jaccard_weighted\"]) Out[5]: precision_macro recall_macro jaccard_weighted LR 0.9429 0.9484 0.8924 LDA 0.9667 0.9762 0.9457 RF 0.8799 0.8915 0.7968 In\u00a0[10]: Copied! <pre># Some plots allow you to choose the target class to look at\natom.rf.plot_probabilities(rows=\"train\", target=0)\n</pre> # Some plots allow you to choose the target class to look at atom.rf.plot_probabilities(rows=\"train\", target=0) In\u00a0[8]: Copied! <pre>atom.lda.plot_shap_heatmap(target=2, show=7)\n</pre> atom.lda.plot_shap_heatmap(target=2, show=7)"}, {"location": "examples/multiclass_classification/#example-multiclass-classification", "title": "Example: Multiclass classification\u00b6", "text": "<p>This example shows how to compare the performance of three models on a multiclass classification task.</p> <p>Import the wine dataset from sklearn.datasets. This is a small and easy to train dataset whose goal is to predict wines into three groups (which cultivator it's from) using features based on the results of chemical analysis.</p>"}, {"location": "examples/multiclass_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multiclass_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multiclass_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multilabel_classification/", "title": "Multilabel classification", "text": "In\u00a0[1]: Copied! 
<pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_multilabel_classification\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier from sklearn.datasets import make_multilabel_classification In\u00a0[2]: Copied! <pre># Create data\nX, y = make_multilabel_classification(n_samples=300, n_classes=3, random_state=1)\n</pre> # Create data X, y = make_multilabel_classification(n_samples=300, n_classes=3, random_state=1) In\u00a0[3]: Copied! <pre># Note that for multioutput tasks, you must specify the `y` keyword\natom = ATOMClassifier(X, y=y, verbose=2, random_state=1)\n</pre> # Note that for multioutput tasks, you must specify the `y` keyword atom = ATOMClassifier(X, y=y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multilabel classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (300, 23)\nTrain set size: 240\nTest set size: 60\n-------------------------------------\nMemory: 51.73 kB\nScaled: False\nOutlier values: 29 (0.5%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># Show the models that natively support multilabel tasks\natom.available_models()[[\"acronym\", \"model\", \"native_multilabel\"]]\n</pre> # Show the models that natively support multilabel tasks atom.available_models()[[\"acronym\", \"model\", \"native_multilabel\"]] Out[4]: acronym model native_multilabel 0 AdaB AdaBoost False 1 Bag Bagging False 2 BNB BernoulliNB False 3 CatB CatBoost False 4 CatNB CategoricalNB False 5 CNB ComplementNB False 6 Tree DecisionTree True 7 Dummy Dummy False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM False 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM False 18 LR LogisticRegression False 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB False 21 PA PassiveAggressive False 22 Perc Perceptron False 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest True 26 Ridge Ridge True 27 SGD StochasticGradientDescent False 28 SVM SupportVectorMachine False 29 XGB XGBoost False In\u00a0[5]: Copied! 
<pre>atom.run(models=[\"LDA\", \"RF\"], metric=\"recall_weighted\")\n</pre> atom.run(models=[\"LDA\", \"RF\"], metric=\"recall_weighted\") <pre>\nTraining ========================= &gt;&gt;\nModels: LDA, RF\nMetric: recall_weighted\n\n\nResults for LinearDiscriminantAnalysis:\nFit ---------------------------------------------\nTrain evaluation --&gt; recall_weighted: 0.9124\nTest evaluation --&gt; recall_weighted: 0.8351\nTime elapsed: 0.037s\n-------------------------------------------------\nTime: 0.037s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; recall_weighted: 1.0\nTest evaluation --&gt; recall_weighted: 0.8763\nTime elapsed: 0.170s\n-------------------------------------------------\nTime: 0.170s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 0.269s\n-------------------------------------\nLinearDiscriminantAnalysis --&gt; recall_weighted: 0.8351\nRandomForest               --&gt; recall_weighted: 0.8763 !\n</pre> In\u00a0[6]: Copied! <pre># Note that non-native multioutput models use a meta-estimator wrapper\nprint(f\"Estimator for LDA is: {atom.lda.estimator}\")\nprint(f\"Estimator for RF is: {atom.rf.estimator}\")\n</pre> # Note that non-native multioutput models use a meta-estimator wrapper print(f\"Estimator for LDA is: {atom.lda.estimator}\") print(f\"Estimator for RF is: {atom.rf.estimator}\") <pre>Estimator for LDA is: ClassifierChain(base_estimator=LinearDiscriminantAnalysis(), random_state=1)\nEstimator for RF is: RandomForestClassifier(n_jobs=1, random_state=1)\n</pre> In\u00a0[7]: Copied! 
<pre>from atom import ATOMModel\nfrom sklearn.multioutput import ClassifierChain\nfrom sklearn.linear_model import LogisticRegression\nfrom optuna.distributions import CategoricalDistribution, IntDistribution\n\ncustom_model = ATOMModel(\n    estimator=ClassifierChain(LogisticRegression(), cv=3),\n    name=\"chain\",\n    needs_scaling=True,\n    native_multilabel=True,\n)\n\natom.run(\n    models=custom_model,\n    n_trials=5,\n    ht_params={\n        \"distributions\": {\n            \"order\": CategoricalDistribution([[0, 1, 2], [2, 1, 0], [1, 2, 0]]),\n            \"base_estimator__max_iter\": IntDistribution(100, 200, step=10),\n            \"base_estimator__solver\": CategoricalDistribution([\"lbfgs\", \"newton-cg\"]),            \n        }\n    },\n)\n</pre> from atom import ATOMModel from sklearn.multioutput import ClassifierChain from sklearn.linear_model import LogisticRegression from optuna.distributions import CategoricalDistribution, IntDistribution  custom_model = ATOMModel(     estimator=ClassifierChain(LogisticRegression(), cv=3),     name=\"chain\",     needs_scaling=True,     native_multilabel=True, )  atom.run(     models=custom_model,     n_trials=5,     ht_params={         \"distributions\": {             \"order\": CategoricalDistribution([[0, 1, 2], [2, 1, 0], [1, 2, 0]]),             \"base_estimator__max_iter\": IntDistribution(100, 200, step=10),             \"base_estimator__solver\": CategoricalDistribution([\"lbfgs\", \"newton-cg\"]),                     }     }, ) <pre>\nTraining ========================= &gt;&gt;\nModels: chain\nMetric: recall_weighted\n\n\nRunning hyperparameter tuning for ClassifierChain...\n| trial |     order | base_estimator__max_iter | base_estimator__solver | recall_weighted | best_recall_weighted | time_trial | time_ht |    state |\n| ----- | --------- | ------------------------ | ---------------------- | --------------- | -------------------- | ---------- | ------- | -------- |\n| 0     | [2, 1, 0] |        
              130 |                  lbfgs |          0.8831 |               0.8831 |     2.813s |  2.813s | COMPLETE |\n| 1     | [1, 2, 0] |                      150 |              newton-cg |          0.9091 |               0.9091 |     2.184s |  4.997s | COMPLETE |\n| 2     | [2, 1, 0] |                      170 |              newton-cg |          0.8701 |               0.9091 |     0.085s |  5.082s | COMPLETE |\n| 3     | [1, 2, 0] |                      200 |              newton-cg |          0.9221 |               0.9221 |     0.084s |  5.166s | COMPLETE |\n| 4     | [2, 1, 0] |                      100 |              newton-cg |          0.8701 |               0.9221 |     0.078s |  5.244s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 3\nBest parameters:\n --&gt; order: [1, 2, 0]\n --&gt; base_estimator__max_iter: 200\n --&gt; base_estimator__solver: newton-cg\nBest evaluation --&gt; recall_weighted: 0.9221\nTime elapsed: 5.244s\nFit ---------------------------------------------\nTrain evaluation --&gt; recall_weighted: 0.9021\nTest evaluation --&gt; recall_weighted: 0.866\nTime elapsed: 0.101s\n-------------------------------------------------\nTime: 5.345s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.397s\n-------------------------------------\nClassifierChain --&gt; recall_weighted: 0.866\n</pre> In\u00a0[8]: Copied! <pre>thresholds = atom.rf.get_best_threshold()\nprint(f\"Best threshold per target column: {thresholds}\")\n</pre> thresholds = atom.rf.get_best_threshold() print(f\"Best threshold per target column: {thresholds}\") <pre>Best threshold per target column: [0.7, 0.69, 0.63]\n</pre> In\u00a0[9]: Copied! 
<pre>atom.rf.evaluate(threshold=thresholds)\n</pre> atom.rf.evaluate(threshold=thresholds) Out[9]: <pre>accuracy              0.5667\nap                    0.8893\nf1_weighted           0.7274\njaccard_weighted      0.6271\nprecision_weighted    0.8269\nrecall_weighted       0.6495\nauc                   0.9213\nName: RF, dtype: float64</pre> In\u00a0[10]: Copied! <pre># Use the target parameter in plots to specify which target column to use\natom.plot_roc(target=2)\n</pre> # Use the target parameter in plots to specify which target column to use atom.plot_roc(target=2) In\u00a0[11]: Copied! <pre># When the target parameter also specifies the class, use format (column, class)\natom.plot_probabilities(models=\"chain\", target=(2, 1))\n</pre> # When the target parameter also specifies the class, use format (column, class) atom.plot_probabilities(models=\"chain\", target=(2, 1)) In\u00a0[12]: Copied! <pre>with atom.canvas(figsize=(900, 600)):\n    atom.plot_calibration(target=0)\n    atom.plot_calibration(target=1)\n</pre> with atom.canvas(figsize=(900, 600)):     atom.plot_calibration(target=0)     atom.plot_calibration(target=1)"}, {"location": "examples/multilabel_classification/#example-multilabel-classification", "title": "Example: Multilabel classification\u00b6", "text": "<p>This example shows how to use ATOM to solve a multilabel classification problem.</p> <p>The data used is a synthetic dataset created using sklearn's make_multilabel_classification function.</p>"}, {"location": "examples/multilabel_classification/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multilabel_classification/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multilabel_classification/#add-custom-multilabel-models", "title": "Add custom multilabel models\u00b6", "text": "<p>To use your own meta-estimator with custom parameters, add it as a custom model. 
It's also possible to tune the hyperparameters of this custom meta-estimator.</p>"}, {"location": "examples/multilabel_classification/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/multioutput_regression/", "title": "Multioutput regression", "text": "In\u00a0[1]: Copied! <pre># Disable annoying tf warnings\nimport os\nos.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"\n\nfrom tensorflow import get_logger\nget_logger().setLevel('ERROR')\n\nimport numpy as np\nfrom atom import ATOMRegressor, ATOMModel\nfrom sklearn.datasets import make_regression\n\nfrom scikeras.wrappers import KerasRegressor\nfrom keras.models import Sequential\nfrom keras.layers import Dense\n</pre> # Disable annoying tf warnings import os os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"3\"  from tensorflow import get_logger get_logger().setLevel('ERROR')  import numpy as np from atom import ATOMRegressor, ATOMModel from sklearn.datasets import make_regression  from scikeras.wrappers import KerasRegressor from keras.models import Sequential from keras.layers import Dense In\u00a0[2]: Copied! <pre># Create data\nX, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3)\n</pre> # Create data X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=3) In\u00a0[3]: Copied! 
<pre># Create the neural network\nclass NeuralNetwork(KerasRegressor):\n    \"\"\"Multioutput multilayer perceptron.\"\"\"\n\n    @staticmethod\n    def _keras_build_fn(n_inputs, n_outputs, **kwargs):\n        \"\"\"Create the model's architecture.\"\"\"\n        model = Sequential()\n        model.add(Dense(20, input_dim=n_inputs, activation=\"relu\"))\n        model.add(Dense(20, activation=\"relu\"))\n        model.add(Dense(n_outputs))\n        model.compile(loss=\"mse\", optimizer=\"adam\")\n        return model\n</pre> # Create the neural network class NeuralNetwork(KerasRegressor):     \"\"\"Multioutput multilayer perceptron.\"\"\"      @staticmethod     def _keras_build_fn(n_inputs, n_outputs, **kwargs):         \"\"\"Create the model's architecture.\"\"\"         model = Sequential()         model.add(Dense(20, input_dim=n_inputs, activation=\"relu\"))         model.add(Dense(20, activation=\"relu\"))         model.add(Dense(n_outputs))         model.compile(loss=\"mse\", optimizer=\"adam\")         return model In\u00a0[4]: Copied! <pre># Convert the model to an ATOM model\nmodel = ATOMModel(\n    estimator=NeuralNetwork(n_inputs=5, n_outputs=y.shape[1], epochs=100, verbose=0),\n    name=\"NN\",\n    needs_scaling=True,  # Applies automated feature scaling before fitting\n    native_multioutput=True,  # Do not use a multioutput meta-estimator wrapper\n)\n</pre> # Convert the model to an ATOM model model = ATOMModel(     estimator=NeuralNetwork(n_inputs=5, n_outputs=y.shape[1], epochs=100, verbose=0),     name=\"NN\",     needs_scaling=True,  # Applies automated feature scaling before fitting     native_multioutput=True,  # Do not use a multioutput meta-estimator wrapper ) In\u00a0[5]: Copied! 
<pre>atom = ATOMRegressor(X, y=y, verbose=2, random_state=1)\n</pre> atom = ATOMRegressor(X, y=y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multioutput regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (1000, 13)\nTrain set size: 800\nTest set size: 200\n-------------------------------------\nMemory: 104.13 kB\nScaled: True\nOutlier values: 27 (0.3%)\n\n</pre> In\u00a0[6]: Copied! <pre># Show the models that natively support multioutput tasks\natom.available_models()[[\"acronym\", \"model\", \"native_multioutput\"]]\n</pre> # Show the models that natively support multioutput tasks atom.available_models()[[\"acronym\", \"model\", \"native_multioutput\"]] Out[6]: acronym model native_multioutput 0 AdaB AdaBoost False 1 ARD AutomaticRelevanceDetermination False 2 Bag Bagging False 3 BR BayesianRidge False 4 CatB CatBoost False 5 Tree DecisionTree True 6 Dummy Dummy False 7 EN ElasticNet False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GP GaussianProcess False 11 GBM GradientBoostingMachine False 12 Huber HuberRegression False 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 Lasso Lasso False 16 Lars LeastAngleRegression False 17 LGB LightGBM False 18 lSVM LinearSVM False 19 MLP MultiLayerPerceptron False 20 OLS OrdinaryLeastSquares False 21 OMP OrthogonalMatchingPursuit False 22 PA PassiveAggressive False 23 RNN RadiusNearestNeighbors True 24 RF RandomForest True 25 Ridge Ridge False 26 SGD StochasticGradientDescent False 27 SVM SupportVectorMachine False 28 XGB XGBoost False In\u00a0[7]: Copied! 
<pre># Note we only added 5 informative features to the dataset, let's remove the rest\n# If we use a model with no native support for multioutput as solver, specify the\n# rfe's importance_getter parameter and return the mean of the coefficients over the\n# target columns\natom.feature_selection(\n    strategy=\"rfe\",\n    solver=\"ols\",  # This becomes MultiOutputRegressor(OLS)\n    n_features=5,\n    importance_getter=lambda x: np.mean([e.coef_ for e in x.estimators_], axis=0),\n)\n</pre> # Note we only added 5 informative features to the dataset, let's remove the rest # If we use a model with no native support for multioutput as solver, specify the # rfe's importance_getter parameter and return the mean of the coefficients over the # target columns atom.feature_selection(     strategy=\"rfe\",     solver=\"ols\",  # This becomes MultiOutputRegressor(OLS)     n_features=5,     importance_getter=lambda x: np.mean([e.coef_ for e in x.estimators_], axis=0), ) <pre>Fitting FeatureSelector...\nPerforming feature selection ...\n --&gt; rfe selected 5 features from the dataset.\n   --&gt; Dropping feature x0 (rank 6).\n   --&gt; Dropping feature x5 (rank 5).\n   --&gt; Dropping feature x6 (rank 3).\n   --&gt; Dropping feature x7 (rank 2).\n   --&gt; Dropping feature x9 (rank 4).\n</pre> In\u00a0[8]: Copied! 
<pre># Let's train a native, non-native and our custom model\natom.run(models=[\"Lasso\", \"RF\", model], metric=\"mse\")\n</pre> # Let's train a native, non-native and our custom model atom.run(models=[\"Lasso\", \"RF\", model], metric=\"mse\") <pre>\nTraining ========================= &gt;&gt;\nModels: Lasso, RF, NN\nMetric: mse\n\n\nResults for Lasso:\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -5.1516\nTest evaluation --&gt; mse: -5.5774\nTime elapsed: 0.031s\n-------------------------------------------------\nTime: 0.031s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -200.7336\nTest evaluation --&gt; mse: -1494.3406\nTime elapsed: 0.706s\n-------------------------------------------------\nTime: 0.706s\n\n\nResults for NeuralNetwork:\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -111.3789\nTest evaluation --&gt; mse: -105.2649\nTime elapsed: 2.372s\n-------------------------------------------------\nTime: 2.372s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.116s\n-------------------------------------\nLasso         --&gt; mse: -5.5774 !\nRandomForest  --&gt; mse: -1494.3406 ~\nNeuralNetwork --&gt; mse: -105.2649\n</pre> In\u00a0[9]: Copied! 
<pre># And check which of the models used a meta-estimator wrapper\nfor m in atom.models:\n    print(f\"Estimator for {m} is: {atom[m].estimator}\")\n</pre> # And check which of the models used a meta-estimator wrapper for m in atom.models:     print(f\"Estimator for {m} is: {atom[m].estimator}\") <pre>Estimator for Lasso is: MultiOutputRegressor(estimator=Lasso(random_state=1), n_jobs=1)\nEstimator for RF is: RandomForestRegressor(n_jobs=1, random_state=1)\nEstimator for NN is: NeuralNetwork(\n\tmodel=None\n\tbuild_fn=None\n\twarm_start=False\n\trandom_state=1\n\toptimizer=rmsprop\n\tloss=None\n\tmetrics=None\n\tbatch_size=None\n\tvalidation_batch_size=None\n\tverbose=0\n\tcallbacks=None\n\tvalidation_split=0.0\n\tshuffle=True\n\trun_eagerly=False\n\tepochs=100\n\tn_inputs=5\n\tn_outputs=3\n\tname=NN\n\tneeds_scaling=True\n\tnative_multioutput=True\n\tnative_multilabel=False\n\thas_validation=None\n)\n</pre> In\u00a0[10]: Copied! <pre># Use the target parameter in plots to specify which target column to use\natom.plot_residuals(target=2)\n</pre> # Use the target parameter in plots to specify which target column to use atom.plot_residuals(target=2) In\u00a0[11]: Copied! 
<pre>with atom.canvas(3, 1, figsize=(900, 1300)):\n    atom.plot_errors(target=0)\n    atom.plot_errors(target=1)\n    atom.plot_errors(target=2)\n</pre> with atom.canvas(3, 1, figsize=(900, 1300)):     atom.plot_errors(target=0)     atom.plot_errors(target=1)     atom.plot_errors(target=2) <pre>\n---------------------------------------------------------------------------\nValueError                                Traceback (most recent call last)\nCell In[11], line 2\n      1 with atom.canvas(3, 1, figsize=(900, 1300)):\n----&gt; 2     atom.plot_errors(target=0)\n      3     atom.plot_errors(target=1)\n      4     atom.plot_errors(target=2)\n\nFile ~\\Documents\\Python\\ATOM\\atom\\utils\\utils.py:2712, in crash.&lt;locals&gt;.wrapper(*args, **kwargs)\n   2709     cache[\"last_exception\"] = ex\n   2710     args[0].logger.exception(\"Exception encountered:\")\n-&gt; 2712 raise ex\n\nFile ~\\Documents\\Python\\ATOM\\atom\\utils\\utils.py:2704, in crash.&lt;locals&gt;.wrapper(*args, **kwargs)\n   2701 @wraps(f)\n   2702 def wrapper(*args, **kwargs) -&gt; Any:\n   2703     try:  # Run the function\n-&gt; 2704         return f(*args, **kwargs)\n   2706     except Exception as ex:\n   2707         # If exception is not the same as last, write to log\n   2708         if ex is not cache[\"last_exception\"] and args[0].logger:\n\nFile ~\\Documents\\Python\\ATOM\\atom\\plots\\predictionplot.py:691, in PredictionPlot.plot_errors(self, models, rows, target, title, legend, figsize, filename, display)\n    689         from atom.models import OrdinaryLeastSquares\n    690         model = OrdinaryLeastSquares(goal=self.task.goal, branches=self._branches)\n--&gt; 691         estimator = model._get_est().fit(bk.DataFrame(y_true), y_pred)\n    693         fig.add_trace(\n    694             self._draw_line(\n    695                 x=(x := np.linspace(y_true.min(), y_true.max(), 100)),\n   (...)\n    703             )\n    704         )\n    706 
self._draw_straight_line(y=\"diagonal\", xaxis=xaxis, yaxis=yaxis)\n\nFile ~\\Documents\\Python\\ATOM\\venv310\\lib\\site-packages\\sklearn\\base.py:1152, in _fit_context.&lt;locals&gt;.decorator.&lt;locals&gt;.wrapper(estimator, *args, **kwargs)\n   1145     estimator._validate_params()\n   1147 with config_context(\n   1148     skip_parameter_validation=(\n   1149         prefer_skip_nested_validation or global_skip_validation\n   1150     )\n   1151 ):\n-&gt; 1152     return fit_method(estimator, *args, **kwargs)\n\nFile ~\\Documents\\Python\\ATOM\\venv310\\lib\\site-packages\\sklearn\\multioutput.py:248, in _MultiOutputEstimator.fit(self, X, y, sample_weight, **fit_params)\n    245     check_classification_targets(y)\n    247 if y.ndim == 1:\n--&gt; 248     raise ValueError(\n    249         \"y must have at least two dimensions for \"\n    250         \"multi-output regression but has only one.\"\n    251     )\n    253 if _routing_enabled():\n    254     routed_params = process_routing(\n    255         obj=self,\n    256         method=\"fit\",\n    257         other_params=fit_params,\n    258         sample_weight=sample_weight,\n    259     )\n\nValueError: y must have at least two dimensions for multi-output regression but has only one.</pre>"}, {"location": "examples/multioutput_regression/#example-multioutput-regression", "title": "Example: Multioutput regression\u00b6", "text": "<p>This example shows how to use ATOM to make preditions on a multioutput regression dataset. 
One of the models used is a MLP regressor implemented with Keras using scikeras.</p> <p>The data used is a synthetic dataset created using sklearn's make_regression function.</p>"}, {"location": "examples/multioutput_regression/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/multioutput_regression/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/multioutput_regression/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/nlp/", "title": "NLP", "text": "In\u00a0[1]: Copied! <pre>import numpy as np\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import fetch_20newsgroups\n</pre> import numpy as np from atom import ATOMClassifier from sklearn.datasets import fetch_20newsgroups In\u00a0[2]: Copied! <pre># Use only a subset of the available topics for faster processing\nX_text, y_text = fetch_20newsgroups(\n    return_X_y=True,\n    categories=[\n        'sci.med',\n        'comp.windows.x',\n        'misc.forsale',\n        'rec.autos',\n    ],\n    shuffle=True,\n    random_state=1,\n)\nX_text = np.array(X_text).reshape(-1, 1)\n</pre> # Use only a subset of the available topics for faster processing X_text, y_text = fetch_20newsgroups(     return_X_y=True,     categories=[         'sci.med',         'comp.windows.x',         'misc.forsale',         'rec.autos',     ],     shuffle=True,     random_state=1, ) X_text = np.array(X_text).reshape(-1, 1) In\u00a0[3]: Copied! 
<pre>atom = ATOMClassifier(X_text, y_text, index=True, test_size=0.3, verbose=2, random_state=1)\n</pre> atom = ATOMClassifier(X_text, y_text, index=True, test_size=0.3, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Multiclass classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (2366, 2)\nTrain set size: 1657\nTest set size: 709\n-------------------------------------\nMemory: 122.87 kB\nScaled: False\nCategorical features: 1 (100.0%)\n\n</pre> In\u00a0[4]: Copied! <pre>atom.dataset  # Note that the feature is automatically named 'corpus'\n</pre> atom.dataset  # Note that the feature is automatically named 'corpus' Out[4]: corpus target 1731 From: rlm@helen.surfcty.com (Robert L. McMilli... 0 1496 From: carl@SOL1.GPS.CALTECH.EDU (Carl J Lydick... 3 1290 From: thssjxy@iitmax.iit.edu (Smile)\\nSubject:... 1 2021 From: c23st@kocrsv01.delcoelect.com (Spiros Tr... 2 142 From: ginkgo@ecsvax.uncecs.edu (J. Geary Morto... 1 ... ... ... 510 From: mary@uicsl.csl.uiuc.edu (Mary E. Allison... 3 1948 From: ndd@sunbar.mc.duke.edu (Ned Danieley)\\nS... 0 798 From: kk@unisql.UUCP (Kerry Kimbrough)\\nSubjec... 0 2222 From: hamachi@adobe.com (Gordon Hamachi)\\nSubj... 2 2215 From: mobasser@vu-vlsi.ee.vill.edu (Bijan Moba... 2 <p>2366 rows \u00d7 2 columns</p> In\u00a0[5]: Copied! <pre># Let's have a look at the first document\natom.corpus[0]\n</pre> # Let's have a look at the first document atom.corpus[0] Out[5]: <pre>'From: caf@omen.UUCP (Chuck Forsberg WA7KGX)\\nSubject: Re: My New Diet --&gt; IT WORKS GREAT !!!!\\nOrganization: Omen Technology INC, Portland Rain Forest\\nLines: 32\\n\\nIn article &lt;1qk6v3INNrm6@lynx.unm.edu&gt; bhjelle@carina.unm.edu () writes:\\n&gt;\\n&gt;Gordon Banks:\\n&gt;\\n&gt;&gt;a lot to keep from going back to morbid obesity.  I think all\\n&gt;&gt;of us cycle.  
One\\'s success depends on how large the fluctuations\\n&gt;&gt;in the cycle are.  Some people can cycle only 5 pounds.  Unfortunately,\\n&gt;&gt;I\\'m not one of them.\\n&gt;&gt;\\n&gt;&gt;\\n&gt;This certainly describes my situation perfectly. For me there is\\n&gt;a constant dynamic between my tendency to eat, which appears to\\n&gt;be totally limitless, and the purely conscious desire to not\\n&gt;put on too much weight. When I get too fat, I just diet/exercise\\n&gt;more (with varying degrees of success) to take off the\\n&gt;extra weight. Usually I cycle within a 15 lb range, but\\n&gt;smaller and larger cycles occur as well. I\\'m always afraid\\n&gt;that this method will stop working someday, but usually\\n&gt;I seem to be able to hold the weight gain in check.\\n&gt;This is one reason I have a hard time accepting the notion\\n&gt;of some metabolic derangement associated with cycle dieting\\n&gt;(that results in long-term weight gain). I have been cycle-\\n&gt;dieting for at least 20 years without seeing such a change.\\n\\nAs mentioned in Adiposity 101, only some experience weight\\nrebound.  The fact that you don\\'t doesn\\'t prove it doesn\\'t\\nhappen to others.\\n-- \\nChuck Forsberg WA7KGX          ...!tektronix!reed!omen!caf \\nAuthor of YMODEM, ZMODEM, Professional-YAM, ZCOMM, and DSZ\\n  Omen Technology Inc    \"The High Reliability Software\"\\n17505-V NW Sauvie IS RD   Portland OR 97231   503-621-3406\\n'</pre> In\u00a0[6]: Copied! <pre># Clean the documents from noise (emails, numbers, etc...)\natom.textclean()\n</pre> # Clean the documents from noise (emails, numbers, etc...) 
atom.textclean() <pre>Fitting TextCleaner...\nCleaning the corpus...\n --&gt; Decoding unicode characters to ascii.\n --&gt; Converting text to lower case.\n --&gt; Dropping emails from documents.\n --&gt; Dropping URL links from documents.\n --&gt; Dropping HTML tags from documents.\n --&gt; Dropping emojis from documents.\n --&gt; Dropping numbers from documents.\n --&gt; Dropping punctuation from the text.\n</pre> In\u00a0[7]: Copied! <pre># Check how the first document changed\natom.corpus[0]\n</pre> # Check how the first document changed atom.corpus[0] Out[7]: <pre>'from  chuck forsberg wa7kgx\\nsubject re my new diet  it works great \\norganization omen technology inc portland rain forest\\nlines \\n\\nin article    writes\\n\\ngordon banks\\n\\na lot to keep from going back to morbid obesity  i think all\\nof us cycle  ones success depends on how large the fluctuations\\nin the cycle are  some people can cycle only  pounds  unfortunately\\nim not one of them\\n\\n\\nthis certainly describes my situation perfectly for me there is\\na constant dynamic between my tendency to eat which appears to\\nbe totally limitless and the purely conscious desire to not\\nput on too much weight when i get too fat i just dietexercise\\nmore with varying degrees of success to take off the\\nextra weight usually i cycle within a  lb range but\\nsmaller and larger cycles occur as well im always afraid\\nthat this method will stop working someday but usually\\ni seem to be able to hold the weight gain in check\\nthis is one reason i have a hard time accepting the notion\\nof some metabolic derangement associated with cycle dieting\\nthat results in longterm weight gain i have been cycle\\ndieting for at least  years without seeing such a change\\n\\nas mentioned in adiposity  only some experience weight\\nrebound  the fact that you dont doesnt prove it doesnt\\nhappen to others\\n \\nchuck forsberg wa7kgx          tektronixreedomencaf \\nauthor of ymodem zmodem professionalyam 
zcomm and dsz\\n  omen technology inc    the high reliability software\\nv nw sauvie is rd   portland or    \\n'</pre> In\u00a0[8]: Copied! <pre># Convert the strings to a sequence of words\natom.tokenize()\n</pre> # Convert the strings to a sequence of words atom.tokenize() <pre>Fitting Tokenizer...\nTokenizing the corpus...\n</pre> In\u00a0[9]: Copied! <pre># Print the first few words of the first document\natom.corpus[0][:7]\n</pre> # Print the first few words of the first document atom.corpus[0][:7] Out[9]: <pre>['from', 'chuck', 'forsberg', 'wa7kgx', 'subject', 're', 'my']</pre> In\u00a0[10]: Copied! <pre># Normalize the text to a predefined standard\natom.textnormalize(stopwords=\"english\", lemmatize=True)\n</pre> # Normalize the text to a predefined standard atom.textnormalize(stopwords=\"english\", lemmatize=True) <pre>Fitting TextNormalizer...\nNormalizing the corpus...\n --&gt; Dropping stopwords.\n --&gt; Applying lemmatization.\n</pre> In\u00a0[11]: Copied! <pre>atom.corpus[0][:7]  # Check changes...\n</pre> atom.corpus[0][:7]  # Check changes... Out[11]: <pre>['chuck', 'forsberg', 'wa7kgx', 'subject', 'new', 'diet', 'work']</pre> In\u00a0[12]: Copied! <pre># Visualize the most common words with a wordcloud\natom.plot_wordcloud(figsize=(700, 500))\n</pre> # Visualize the most common words with a wordcloud atom.plot_wordcloud(figsize=(700, 500)) In\u00a0[13]: Copied! <pre># Have a look at the most frequent bigrams\natom.plot_ngrams(2)\n</pre> # Have a look at the most frequent bigrams atom.plot_ngrams(2) In\u00a0[14]: Copied! <pre># Create the bigrams using the tokenizer\natom.tokenize(bigram_freq=215)\n</pre> # Create the bigrams using the tokenizer atom.tokenize(bigram_freq=215) <pre>Fitting Tokenizer...\nTokenizing the corpus...\n --&gt; Creating 7 bigrams on 3128 locations.\n</pre> In\u00a0[15]: Copied! 
<pre>atom.bigrams_\n</pre> atom.bigrams_ Out[15]: bigram frequency 0 x_x 1168 1 line_article 532 2 line_nntppostinghost 389 3 organization_university 331 4 gordon_bank 266 5 distribution_usa 227 6 line_distribution 215 In\u00a0[16]: Copied! <pre># As a last step before modelling, convert the words to vectors\natom.vectorize(strategy=\"tfidf\")\n</pre> # As a last step before modelling, convert the words to vectors atom.vectorize(strategy=\"tfidf\") <pre>Fitting Vectorizer...\nVectorizing the corpus...\n</pre> In\u00a0[17]: Copied! <pre># The dimensionality of the dataset has increased a lot!\natom.shape\n</pre> # The dimensionality of the dataset has increased a lot! atom.shape Out[17]: <pre>(2366, 24176)</pre> In\u00a0[18]: Copied! <pre># Note that the data is sparse and the columns are named\n# after the words they are embedding\natom.dtypes\n</pre> # Note that the data is sparse and the columns are named # after the words they are embedding atom.dtypes Out[18]: <pre>corpus_000000e5    Sparse[float64, 0]\ncorpus_00000ee5    Sparse[float64, 0]\ncorpus_000010af    Sparse[float64, 0]\ncorpus_0007259d    Sparse[float64, 0]\ncorpus_00072a27    Sparse[float64, 0]\n                          ...        \ncorpus_zurich      Sparse[float64, 0]\ncorpus_zvi         Sparse[float64, 0]\ncorpus_zx          Sparse[float64, 0]\ncorpus_zz          Sparse[float64, 0]\ntarget                          int64\nLength: 24176, dtype: object</pre> In\u00a0[19]: Copied! <pre># When the dataset is sparse, stats() shows the density\natom.stats()\n</pre> # When the dataset is sparse, stats() shows the density atom.stats() <pre>Dataset stats ==================== &gt;&gt;\nShape: (2366, 24176)\nTrain set size: 1657\nTest set size: 709\n-------------------------------------\nMemory: 2.54 MB\nSparse: True\nDensity: 0.35%\n</pre> In\u00a0[20]: Copied! 
<pre># Check which models have support for sparse matrices\natom.available_models()[[\"acronym\", \"model\", \"accepts_sparse\"]]\n</pre> # Check which models have support for sparse matrices atom.available_models()[[\"acronym\", \"model\", \"accepts_sparse\"]] Out[20]: acronym model accepts_sparse 0 AdaB AdaBoost True 1 Bag Bagging True 2 BNB BernoulliNB True 3 CatB CatBoost True 4 CatNB CategoricalNB True 5 CNB ComplementNB True 6 Tree DecisionTree True 7 Dummy Dummy False 8 ETree ExtraTree True 9 ET ExtraTrees True 10 GNB GaussianNB False 11 GP GaussianProcess False 12 GBM GradientBoostingMachine True 13 hGBM HistGradientBoosting False 14 KNN KNearestNeighbors True 15 LGB LightGBM True 16 LDA LinearDiscriminantAnalysis False 17 lSVM LinearSVM True 18 LR LogisticRegression True 19 MLP MultiLayerPerceptron True 20 MNB MultinomialNB True 21 PA PassiveAggressive True 22 Perc Perceptron False 23 QDA QuadraticDiscriminantAnalysis False 24 RNN RadiusNearestNeighbors True 25 RF RandomForest True 26 Ridge Ridge True 27 SGD StochasticGradientDescent True 28 SVM SupportVectorMachine True 29 XGB XGBoost True In\u00a0[21]: Copied! <pre># Train the model\natom.run(models=\"RF\", metric=\"f1_weighted\")\n</pre> # Train the model atom.run(models=\"RF\", metric=\"f1_weighted\") <pre>\nTraining ========================= &gt;&gt;\nModels: RF\nMetric: f1_weighted\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1_weighted: 1.0\nTest evaluation --&gt; f1_weighted: 0.9181\nTime elapsed: 02m:24s\n-------------------------------------------------\nTime: 02m:24s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 02m:24s\n-------------------------------------\nRandomForest --&gt; f1_weighted: 0.9181\n</pre> In\u00a0[22]: Copied! <pre>atom.evaluate()\n</pre> atom.evaluate() Out[22]: ba f1_weighted jaccard_weighted mcc precision_weighted recall_weighted RF 0.9183 0.9181 0.8486 0.8918 0.9206 0.9182 In\u00a0[23]: Copied! 
<pre>atom.plot_confusion_matrix(figsize=(700, 600))\n</pre> atom.plot_confusion_matrix(figsize=(700, 600)) In\u00a0[24]: Copied! <pre>atom.plot_shap_decision(rows=0, show=15)\n</pre> atom.plot_shap_decision(rows=0, show=15) In\u00a0[25]: Copied! <pre>atom.plot_shap_beeswarm(target=0, show=15)\n</pre> atom.plot_shap_beeswarm(target=0, show=15) <pre>100%|===================| 2827/2836 [02:38&lt;00:00]        </pre>"}, {"location": "examples/nlp/#example-nlp", "title": "Example: NLP\u00b6", "text": "<p>This example shows how to use ATOM to quickly go from raw text data to model predictions.</p> <p>Import the 20 newsgroups text dataset from sklearn.datasets. The dataset comprises around 18000 articles on 20 topics. The goal is to predict the topic of every article.</p>"}, {"location": "examples/nlp/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/nlp/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/nlp/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/pruning/", "title": "Pruning", "text": "In\u00a0[1]: Copied! <pre># Import packages\nfrom sklearn.datasets import load_breast_cancer\nfrom optuna.pruners import HyperbandPruner\nfrom atom import ATOMClassifier\n</pre> # Import packages from sklearn.datasets import load_breast_cancer from optuna.pruners import HyperbandPruner from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX, y = load_breast_cancer(return_X_y=True)\n</pre> # Load the data X, y = load_breast_cancer(return_X_y=True) In\u00a0[3]: Copied! 
<pre># Initialize atom\natom = ATOMClassifier(X, y, verbose=2, random_state=1)\n</pre> # Initialize atom atom = ATOMClassifier(X, y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (569, 31)\nTrain set size: 456\nTest set size: 113\n-------------------------------------\nMemory: 141.24 kB\nScaled: False\nOutlier values: 167 (1.2%)\n\n</pre> In\u00a0[4]: Copied! <pre># Use ht_params to specify a custom pruner\n# Note that pruned trials show the number of iterations it completed\natom.run(\n    models=\"SGD\",\n    metric=\"f1\",\n    n_trials=25,\n    ht_params={\n        \"distributions\": [\"penalty\", \"max_iter\"],\n        \"pruner\": HyperbandPruner(),\n    }\n)\n</pre> # Use ht_params to specify a custom pruner # Note that pruned trials show the number of iterations it completed atom.run(     models=\"SGD\",     metric=\"f1\",     n_trials=25,     ht_params={         \"distributions\": [\"penalty\", \"max_iter\"],         \"pruner\": HyperbandPruner(),     } ) <pre>\nTraining ========================= &gt;&gt;\nModels: SGD\nMetric: f1\n\n\nRunning hyperparameter tuning for StochasticGradientDescent...\n| trial | penalty | max_iter |      f1 | best_f1 | time_trial | time_ht |    state |\n| ----- | ------- | -------- | ------- | ------- | ---------- | ------- | -------- |\n| 0     |      l1 |      650 |  0.9558 |  0.9558 |     2.801s |  2.801s | COMPLETE |\n| 1     | elast.. |     1050 |  0.9744 |  0.9744 |     4.590s |  7.390s | COMPLETE |\n| 2     | elast.. 
|      500 |  0.9828 |  0.9828 |     0.033s |  7.423s |   PRUNED |\n| 3     |    None |      700 |  0.9739 |  0.9828 |     2.951s | 10.374s | COMPLETE |\n| 4     |      l1 |     1400 |  0.9735 |  0.9828 |     0.033s | 10.407s |   PRUNED |\n| 5     |    None |     1400 |  0.9735 |  0.9828 |     5.994s | 16.401s | COMPLETE |\n| 6     |      l2 |     1200 |  0.9825 |  0.9828 |     5.246s | 21.647s | COMPLETE |\n| 7     |      l2 |     1250 |  0.9825 |  0.9828 |     5.436s | 27.083s | COMPLETE |\n| 8     |    None |      600 |  0.9828 |  0.9828 |     0.023s | 27.106s |   PRUNED |\n| 9     |      l1 |      600 |  0.9402 |  0.9828 |     0.030s | 27.136s |   PRUNED |\n| 10    |      l2 |      950 |  0.9565 |  0.9828 |     4.118s | 31.254s | COMPLETE |\n| 11    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.259s | COMPLETE |\n| 12    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.264s | COMPLETE |\n| 13    |      l2 |     1200 |  0.9825 |  0.9828 |     0.005s | 31.269s | COMPLETE |\n| 14    |      l2 |     1500 |  0.9573 |  0.9828 |     0.038s | 31.306s |   PRUNED |\n| 15    |      l2 |      950 |  0.9565 |  0.9828 |     0.005s | 31.311s | COMPLETE |\n| 16    |      l2 |     1100 |  0.9391 |  0.9828 |     0.040s | 31.351s |   PRUNED |\n| 17    |      l2 |      850 |  0.9831 |  0.9831 |     0.030s | 31.381s |   PRUNED |\n| 18    | elast.. 
|     1300 |   0.931 |  0.9831 |     0.029s | 31.410s |   PRUNED |\n| 19    |      l2 |     1300 |  0.9649 |  0.9831 |     0.067s | 31.478s |   PRUNED |\n| 20    |      l2 |      800 |  0.9661 |  0.9831 |     0.039s | 31.517s |   PRUNED |\n| 21    |      l2 |     1150 |  0.9402 |  0.9831 |     0.032s | 31.548s |   PRUNED |\n| 22    |      l2 |     1300 |  0.9573 |  0.9831 |     0.038s | 31.586s |   PRUNED |\n| 23    |      l2 |     1250 |  0.9825 |  0.9831 |     0.008s | 31.594s | COMPLETE |\n| 24    |      l2 |     1050 |  0.9565 |  0.9831 |     0.070s | 31.665s |   PRUNED |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 6\nBest parameters:\n --&gt; penalty: l2\n --&gt; max_iter: 1200\nBest evaluation --&gt; f1: 0.9825\nTime elapsed: 31.665s\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.993\nTest evaluation --&gt; f1: 0.9722\nTime elapsed: 8.384s\n-------------------------------------------------\nTime: 40.049s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 40.301s\n-------------------------------------\nStochasticGradientDescent --&gt; f1: 0.9722\n</pre> In\u00a0[5]: Copied! <pre>atom.plot_trials()\n</pre> atom.plot_trials() In\u00a0[6]: Copied! <pre>atom.plot_hyperparameter_importance()\n</pre> atom.plot_hyperparameter_importance()"}, {"location": "examples/pruning/#example-pruning", "title": "Example: Pruning\u00b6", "text": "<p>This example shows an advanced example on how to use hyperparameter tuning with pruning.</p> <p>Import the breast cancer dataset from sklearn.datasets. 
This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.</p>"}, {"location": "examples/pruning/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/pruning/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/pruning/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/ray_backend/", "title": "Ray backend", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport ray\nimport pandas as pd\nfrom atom import ATOMClassifier\nfrom sklearn.datasets import make_classification\n</pre> # Import packages import ray import pandas as pd from atom import ATOMClassifier from sklearn.datasets import make_classification In\u00a0[2]: Copied! <pre># Use a small dataset for illustration purposes\nX, y = make_classification(n_samples=10000, n_features=10, random_state=1)\n</pre> # Use a small dataset for illustration purposes X, y = make_classification(n_samples=10000, n_features=10, random_state=1) In\u00a0[3]: Copied! <pre># Note we already specify the number of cores for parallel execution here\natom = ATOMClassifier(X, y, n_jobs=2, backend=\"ray\", verbose=2, random_state=1)\n</pre> # Note we already specify the number of cores for parallel execution here atom = ATOMClassifier(X, y, n_jobs=2, backend=\"ray\", verbose=2, random_state=1) <pre>2023-11-04 23:01:00,897\tINFO worker.py:1664 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265 \n</pre> <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\nParallel processing with 2 cores.\nParallelization backend: ray\n\nDataset stats ==================== &gt;&gt;\nShape: (10000, 11)\nTrain set size: 8000\nTest set size: 2000\n-------------------------------------\nMemory: 880.13 kB\nScaled: True\nOutlier values: 211 (0.2%)\n\n</pre> In\u00a0[4]: Copied! 
<pre># The ray backend uses modin instead of pandas as data handler\ntype(atom.dataset)\n</pre> # The ray backend uses modin instead of pandas as data handler type(atom.dataset) Out[4]: <pre>pandas.core.frame.DataFrame</pre> In\u00a0[5]: Copied! <pre># Use data cleaning as usual\natom.scale()\n</pre> # Use data cleaning as usual atom.scale() <pre>Fitting Scaler...\nScaling features...\n</pre> In\u00a0[6]: Copied! <pre># Using `parallel=True`, we train one model in each node\n# Note that when training in parallel, the verbosity of the models is zero\natom.run(models=[\"PA\", \"SGD\"], est_params={\"max_iter\": 150}, parallel=True)\n</pre> # Using `parallel=True`, we train one model in each node # Note that when training in parallel, the verbosity of the models is zero atom.run(models=[\"PA\", \"SGD\"], est_params={\"max_iter\": 150}, parallel=True) <pre>\nTraining ========================= &gt;&gt;\nModels: PA, SGD\nMetric: f1\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 9.407s\n-------------------------------------\nPassiveAggressive         --&gt; f1: 0.8165\nStochasticGradientDescent --&gt; f1: 0.8774 !\n</pre> In\u00a0[7]: Copied! <pre># Notice how the summed time to train the models is less than the total time\natom.plot_results(metric=\"time_fit\")\n</pre> # Notice how the summed time to train the models is less than the total time atom.plot_results(metric=\"time_fit\") In\u00a0[8]: Copied! <pre># Create a rest API endpoint and do inference on the holdout set\natom.pa.serve(port=8001)\n</pre> # Create a rest API endpoint and do inference on the holdout set atom.pa.serve(port=8001) In\u00a0[9]: Copied! <pre>import requests\n\nX_predict = atom.X_test.iloc[:10, :]\nresponse = requests.get(\"http://127.0.0.1:8001/\", json=X_predict.to_json())\n</pre> import requests  X_predict = atom.X_test.iloc[:10, :] response = requests.get(\"http://127.0.0.1:8001/\", json=X_predict.to_json()) In\u00a0[10]: Copied! 
<pre>response.json()\n</pre> response.json() Out[10]: <pre>[1, 1, 0, 0, 1, 1, 0, 1, 0, 0]</pre> In\u00a0[11]: Copied! <pre># Don't forget to shut down the ray server\nray.shutdown()\n</pre> # Don't forget to shut down the ray server ray.shutdown()"}, {"location": "examples/ray_backend/#example-ray-backend", "title": "Example: Ray backend\u00b6", "text": "<p>This example shows how to use the ray backend to train models in a parallel context.</p> <p>The data used is a synthetic dataset created using sklearn's make_classification function.</p>"}, {"location": "examples/ray_backend/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/ray_backend/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/ray_backend/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/regression/", "title": "Regression", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMRegressor\n</pre> # Import packages import pandas as pd from atom import ATOMRegressor In\u00a0[2]: Copied! <pre># Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load the data X = pd.read_csv(\"docs_source/examples/datasets/abalone.csv\")  # Let's have a look X.head() Out[2]: Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 In\u00a0[3]: Copied! 
<pre># Initialize atom for regression tasks\natom = ATOMRegressor(X, \"Rings\", verbose=2, random_state=42)\n</pre> # Initialize atom for regression tasks atom = ATOMRegressor(X, \"Rings\", verbose=2, random_state=42) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (4177, 9)\nTrain set size: 3342\nTest set size: 835\n-------------------------------------\nMemory: 300.88 kB\nScaled: False\nCategorical features: 1 (12.5%)\nOutlier values: 195 (0.6%)\n\n</pre> In\u00a0[4]: Copied! <pre># Encode the categorical features\natom.encode()\n</pre> # Encode the categorical features atom.encode() <pre>Fitting Encoder...\nEncoding categorical columns...\n --&gt; OneHot-encoding feature Sex. Contains 3 classes.\n</pre> In\u00a0[5]: Copied! <pre># Plot the dataset's correlation matrix\natom.plot_correlation()\n</pre> # Plot the dataset's correlation matrix atom.plot_correlation() In\u00a0[6]: Copied! <pre># Apply pca for dimensionality reduction\natom.feature_selection(strategy=\"pca\", n_features=6)\n</pre> # Apply pca for dimensionality reduction atom.feature_selection(strategy=\"pca\", n_features=6) <pre>Fitting FeatureSelector...\nPerforming feature selection ...\n --&gt; Applying Principal Component Analysis...\n   --&gt; Scaling features...\n   --&gt; Keeping 6 components.\n   --&gt; Explained variance ratio: 0.97\n</pre> In\u00a0[7]: Copied! <pre># Note that the fetaures are automatically renamed to pca0, pca1, etc...\natom.columns\n</pre> # Note that the fetaures are automatically renamed to pca0, pca1, etc... atom.columns Out[7]: <pre>Index(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'Rings'], dtype='object')</pre> In\u00a0[8]: Copied! 
<pre># Use the plotting methods to see the retained variance ratio\natom.plot_pca()\n</pre> # Use the plotting methods to see the retained variance ratio atom.plot_pca() In\u00a0[9]: Copied! <pre>atom.plot_components()\n</pre> atom.plot_components() In\u00a0[10]: Copied! <pre>atom.run(\n    models=[\"Tree\", \"Bag\", \"ET\"],\n    metric=\"mse\",\n    n_trials=5,\n    n_bootstrap=5,\n)\n</pre> atom.run(     models=[\"Tree\", \"Bag\", \"ET\"],     metric=\"mse\",     n_trials=5,     n_bootstrap=5, ) <pre>\nTraining ========================= &gt;&gt;\nModels: Tree, Bag, ET\nMetric: mse\n\n\nRunning hyperparameter tuning for DecisionTree...\n| trial |   criterion | splitter | max_depth | min_samples_split | min_samples_leaf | max_features | ccp_alpha |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ----------- | -------- | --------- | ----------------- | ---------------- | ------------ | --------- | ------- | -------- | ---------- | ------- | -------- |\n| 0     | absolute_.. |     best |         5 |                 8 |               10 |         None |     0.035 | -6.5456 |  -6.5456 |     0.255s |  0.255s | COMPLETE |\n| 1     | squared_e.. |     best |        10 |                 5 |                1 |          0.5 |      0.03 | -7.1959 |  -6.5456 |     0.065s |  0.320s | COMPLETE |\n| 2     | absolute_.. |   random |        14 |                15 |               16 |         sqrt |     0.025 | -8.5859 |  -6.5456 |     0.067s |  0.387s | COMPLETE |\n| 3     | friedman_.. 
|   random |         4 |                10 |               17 |          0.9 |      0.01 | -7.4933 |  -6.5456 |     0.052s |  0.439s | COMPLETE |\n| 4     |     poisson |     best |        12 |                15 |                8 |          0.6 |      0.02 | -5.8126 |  -5.8126 |     0.066s |  0.505s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 4\nBest parameters:\n --&gt; criterion: poisson\n --&gt; splitter: best\n --&gt; max_depth: 12\n --&gt; min_samples_split: 15\n --&gt; min_samples_leaf: 8\n --&gt; max_features: 0.6\n --&gt; ccp_alpha: 0.02\nBest evaluation --&gt; mse: -5.8126\nTime elapsed: 0.505s\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -6.2977\nTest evaluation --&gt; mse: -7.1923\nTime elapsed: 0.045s\nBootstrap ---------------------------------------\nEvaluation --&gt; mse: -7.6026 \u00b1 0.3783\nTime elapsed: 0.110s\n-------------------------------------------------\nTime: 0.660s\n\n\nRunning hyperparameter tuning for Bagging...\n| trial | n_estimators | max_samples | max_features | bootstrap | bootstrap_features |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ------------ | ----------- | ------------ | --------- | ------------------ | ------- | -------- | ---------- | ------- | -------- |\n| 0     |          190 |         1.0 |          0.9 |      True |               True | -4.5751 |  -4.5751 |     5.791s |  5.791s | COMPLETE |\n\nException encountered while running the Bag model.\nMemoryError: could not allocate 187712 bytes\n\n\nRunning hyperparameter tuning for ExtraTrees...\n| trial | n_estimators |     criterion | max_depth | min_samples_split | min_samples_leaf | max_features | bootstrap | max_samples | ccp_alpha |     mse | best_mse | time_trial | time_ht |    state |\n| ----- | ------------ | ------------- | --------- | ----------------- | ---------------- | ------------ | --------- | ----------- | --------- | ------- | -------- | ---------- | 
------- | -------- |\n| 0     |          190 | squared_error |         8 |                13 |                3 |          0.5 |      True |         0.6 |     0.025 | -5.1462 |  -5.1462 |     0.285s |  0.285s | COMPLETE |\n| 1     |          230 | absolute_er.. |         8 |                 8 |                8 |         sqrt |      True |         0.6 |       0.0 | -9.3444 |  -5.1462 |     1.377s |  1.662s | COMPLETE |\n| 2     |          180 | absolute_er.. |         7 |                 2 |                3 |          0.6 |      True |         0.6 |      0.03 | -5.7371 |  -5.1462 |     1.738s |  3.400s | COMPLETE |\n| 3     |          100 | squared_error |        14 |                15 |                8 |         None |      True |         0.9 |     0.005 | -5.1938 |  -5.1462 |     0.231s |  3.631s | COMPLETE |\n| 4     |          340 | squared_error |         6 |                15 |                8 |         None |      True |         0.8 |      0.01 | -4.8716 |  -4.8716 |     0.457s |  4.088s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 4\nBest parameters:\n --&gt; n_estimators: 340\n --&gt; criterion: squared_error\n --&gt; max_depth: 6\n --&gt; min_samples_split: 15\n --&gt; min_samples_leaf: 8\n --&gt; max_features: None\n --&gt; bootstrap: True\n --&gt; max_samples: 0.8\n --&gt; ccp_alpha: 0.01\nBest evaluation --&gt; mse: -4.8716\nTime elapsed: 4.088s\nFit ---------------------------------------------\nTrain evaluation --&gt; mse: -5.4808\nTest evaluation --&gt; mse: -6.3445\nTime elapsed: 0.535s\nBootstrap ---------------------------------------\nEvaluation --&gt; mse: -6.3694 \u00b1 0.0737\nTime elapsed: 2.245s\n-------------------------------------------------\nTime: 6.868s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 32.361s\n-------------------------------------\nDecisionTree --&gt; mse: -7.6026 \u00b1 0.3783\nExtraTrees   --&gt; mse: -6.3694 \u00b1 0.0737 !\n</pre> In\u00a0[11]: Copied! 
<pre># Use the errors or residuals plots to check the model performances\natom.plot_residuals()\n</pre> # Use the errors or residuals plots to check the model performances atom.plot_residuals() In\u00a0[12]: Copied! <pre>atom.plot_errors()\n</pre> atom.plot_errors() In\u00a0[13]: Copied! <pre># Analyze the relation between the target response and the features\natom.plot_partial_dependence(columns=(0, 1, 2, 3))\n</pre> # Analyze the relation between the target response and the features atom.plot_partial_dependence(columns=(0, 1, 2, 3))"}, {"location": "examples/regression/#example-regression", "title": "Example: Regression\u00b6", "text": "<p>This example shows how to use ATOM to apply pca on the data and run a regression pipeline.</p> <p>Download the abalone dataset from https://archive.ics.uci.edu/ml/datasets/Abalone. The goal of this dataset is to predict the rings (age) of abalone shells from physical measurements.</p>"}, {"location": "examples/regression/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/regression/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/regression/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/successive_halving/", "title": "Successive halving", "text": "In\u00a0[1]: Copied! <pre>from sklearn.datasets import fetch_california_housing\nfrom atom import ATOMRegressor\n</pre> from sklearn.datasets import fetch_california_housing from atom import ATOMRegressor In\u00a0[2]: Copied! <pre># Load the data\nX, y = fetch_california_housing(return_X_y=True)\n</pre> # Load the data X, y = fetch_california_housing(return_X_y=True) In\u00a0[3]: Copied! 
<pre>atom = ATOMRegressor(X, y, verbose=2, random_state=1)\n</pre> atom = ATOMRegressor(X, y, verbose=2, random_state=1) <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Regression.\n\nDataset stats ==================== &gt;&gt;\nShape: (20640, 9)\nTrain set size: 16512\nTest set size: 4128\n-------------------------------------\nMemory: 1.49 MB\nScaled: False\nOutlier values: 786 (0.5%)\n\n</pre> In\u00a0[4]: Copied! <pre># Compare tree-based models via successive halving\natom.successive_halving(\n    models=[\"Tree\", \"Bag\", \"ET\", \"RF\", \"LGB\", \"CatB\"],\n    metric=\"mae\",\n    n_bootstrap=5,\n)\n</pre> # Compare tree-based models via successive halving atom.successive_halving(     models=[\"Tree\", \"Bag\", \"ET\", \"RF\", \"LGB\", \"CatB\"],     metric=\"mae\",     n_bootstrap=5, ) <pre>\nTraining ========================= &gt;&gt;\nMetric: mae\n\n\nRun: 0 =========================== &gt;&gt;\nModels: Tree6, Bag6, ET6, RF6, LGB6, CatB6\nSize of training set: 16512 (17%)\nSize of test set: 4128\n\n\nResults for DecisionTree:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.0\nTest evaluation --&gt; mae: -0.5394\nTime elapsed: 0.103s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.576 \u00b1 0.0119\nTime elapsed: 0.422s\n-------------------------------------------------\nTime: 0.525s\n\n\nResults for Bagging:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.1715\nTest evaluation --&gt; mae: -0.4308\nTime elapsed: 0.450s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.435 \u00b1 0.0059\nTime elapsed: 2.061s\n-------------------------------------------------\nTime: 2.511s\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.0\nTest evaluation --&gt; mae: -0.3977\nTime elapsed: 
1.574s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.4059 \u00b1 0.0028\nTime elapsed: 7.107s\n-------------------------------------------------\nTime: 8.681s\n\n\nResults for RandomForest:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.1508\nTest evaluation --&gt; mae: -0.4053\nTime elapsed: 4.178s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.4162 \u00b1 0.0031\nTime elapsed: 18.156s\n-------------------------------------------------\nTime: 22.335s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.2031\nTest evaluation --&gt; mae: -0.3594\nTime elapsed: 0.438s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3673 \u00b1 0.0016\nTime elapsed: 0.886s\n-------------------------------------------------\nTime: 1.324s\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.1621\nTest evaluation --&gt; mae: -0.3483\nTime elapsed: 5.084s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3554 \u00b1 0.0025\nTime elapsed: 20.177s\n-------------------------------------------------\nTime: 25.261s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 01m:01s\n-------------------------------------\nDecisionTree --&gt; mae: -0.576 \u00b1 0.0119 ~\nBagging      --&gt; mae: -0.435 \u00b1 0.0059 ~\nExtraTrees   --&gt; mae: -0.4059 \u00b1 0.0028 ~\nRandomForest --&gt; mae: -0.4162 \u00b1 0.0031 ~\nLightGBM     --&gt; mae: -0.3673 \u00b1 0.0016 ~\nCatBoost     --&gt; mae: -0.3554 \u00b1 0.0025 ~ !\n\n\nRun: 1 =========================== &gt;&gt;\nModels: ET3, LGB3, CatB3\nSize of training set: 16512 (33%)\nSize of test set: 4128\n\n\nResults for ExtraTrees:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.0\nTest evaluation --&gt; mae: -0.3739\nTime elapsed: 
2.738s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3841 \u00b1 0.0027\nTime elapsed: 11.259s\n-------------------------------------------------\nTime: 13.997s\n\n\nResults for LightGBM:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.2327\nTest evaluation --&gt; mae: -0.3356\nTime elapsed: 0.389s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.345 \u00b1 0.0037\nTime elapsed: 0.876s\n-------------------------------------------------\nTime: 1.265s\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.1882\nTest evaluation --&gt; mae: -0.3255\nTime elapsed: 4.800s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3352 \u00b1 0.0023\nTime elapsed: 22.708s\n-------------------------------------------------\nTime: 27.509s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 43.130s\n-------------------------------------\nExtraTrees --&gt; mae: -0.3841 \u00b1 0.0027 ~\nLightGBM   --&gt; mae: -0.345 \u00b1 0.0037 ~\nCatBoost   --&gt; mae: -0.3352 \u00b1 0.0023 ~ !\n\n\nRun: 2 =========================== &gt;&gt;\nModels: CatB1\nSize of training set: 16512 (100%)\nSize of test set: 4128\n\n\nResults for CatBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; mae: -0.2229\nTest evaluation --&gt; mae: -0.2986\nTime elapsed: 6.851s\nBootstrap ---------------------------------------\nEvaluation --&gt; mae: -0.3091 \u00b1 0.0026\nTime elapsed: 33.428s\n-------------------------------------------------\nTime: 40.279s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 40.375s\n-------------------------------------\nCatBoost --&gt; mae: -0.3091 \u00b1 0.0026 ~\n</pre> In\u00a0[5]: Copied! <pre># The results is now multi-index, where frac is the fraction\n# of the training set used to fit the model. 
The model names\n# end with the number of models fitted during that run\natom.results\n</pre> # The results is now multi-index, where frac is the fraction # of the training set used to fit the model. The model names # end with the number of models fitted during that run atom.results Out[5]: mae_train mae_test time_fit mae_bootstrap time_bootstrap time frac model 0.17 Bag6 -0.2017 -0.4327 0.450035 -0.434981 2.061373 2.511408 CatB6 -0.2065 -0.3557 5.083625 -0.355352 20.176994 25.260619 ET6 -0.0694 -0.4077 1.574000 -0.405855 7.106890 8.680890 LGB6 -0.2202 -0.3676 0.438399 -0.367271 0.885806 1.324205 RF6 -0.1851 -0.4165 4.178345 -0.416217 18.156310 22.334655 Tree6 -0.1039 -0.5897 0.102987 -0.575962 0.422224 0.525211 0.33 CatB3 -0.2249 -0.3384 4.800246 -0.335246 22.708465 27.508711 ET3 -0.0935 -0.3879 2.738315 -0.384081 11.258794 13.997109 LGB3 -0.2489 -0.3405 0.389353 -0.344951 0.875797 1.265150 1.00 CatB1 -0.2447 -0.3066 6.851350 -0.309112 33.428059 40.279409 In\u00a0[6]: Copied! <pre># Plot the successive halving's results\natom.plot_successive_halving()\n</pre> # Plot the successive halving's results atom.plot_successive_halving() In\u00a0[7]: Copied! <pre># Use regex to call all the models with the same estimator...\natom.plot_errors(models=[\"CatB.*\"])\n</pre> # Use regex to call all the models with the same estimator... atom.plot_errors(models=[\"CatB.*\"]) In\u00a0[8]: Copied! <pre># ...or to call the models from the same run\natom.plot_errors(models=\".*3\")\n</pre> # ...or to call the models from the same run atom.plot_errors(models=\".*3\")"}, {"location": "examples/successive_halving/#example-successive-halving", "title": "Example: Successive halving\u00b6", "text": "<p>This example shows how to compare multiple tree-based models using successive halving.</p> <p>Import the california housing dataset from sklearn.datasets. 
This is a small and easy to train dataset whose goal is to predict house prices.</p>"}, {"location": "examples/successive_halving/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/successive_halving/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/successive_halving/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/train_sizing/", "title": "Train sizing", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport pandas as pd\nfrom atom import ATOMClassifier\n</pre> # Import packages import pandas as pd from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load the data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load the data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! 
<pre># Initialize atom and prepare the data\natom = ATOMClassifier(X, verbose=2, random_state=1)\natom.clean()\natom.impute(strat_num=\"median\", strat_cat=\"most_frequent\", max_nan_rows=0.8)\natom.encode()\n</pre> # Initialize atom and prepare the data atom = ATOMClassifier(X, verbose=2, random_state=1) atom.clean() atom.impute(strat_num=\"median\", strat_cat=\"most_frequent\", max_nan_rows=0.8) atom.encode() <pre>&lt;&lt; ================== ATOM ================== &gt;&gt;\n\nConfiguration ==================== &gt;&gt;\nAlgorithm task: Binary classification.\n\nDataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 25.03 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n\nFitting Cleaner...\nCleaning the data...\nFitting Imputer...\nImputing missing values...\n --&gt; Dropping 161 samples for containing more than 16 missing values.\n --&gt; Imputing 481 missing values with median (12.0) in feature MinTemp.\n --&gt; Imputing 265 missing values with median (22.6) in feature MaxTemp.\n --&gt; Imputing 1354 missing values with median (0.0) in feature Rainfall.\n --&gt; Imputing 60682 missing values with median (4.8) in feature Evaporation.\n --&gt; Imputing 67659 missing values with median (8.4) in feature Sunshine.\n --&gt; Imputing 9187 missing values with most_frequent (W) in feature WindGustDir.\n --&gt; Imputing 9127 missing values with median (39.0) in feature WindGustSpeed.\n --&gt; Imputing 9852 missing values with most_frequent (N) in feature WindDir9am.\n --&gt; Imputing 3617 missing values with most_frequent (SE) in feature WindDir3pm.\n --&gt; Imputing 1187 missing values with median (13.0) in feature WindSpeed9am.\n --&gt; Imputing 2469 missing values with median (19.0) in feature WindSpeed3pm.\n --&gt; Imputing 1613 missing values with median (70.0) in feature Humidity9am.\n --&gt; Imputing 
3449 missing values with median (52.0) in feature Humidity3pm.\n --&gt; Imputing 13863 missing values with median (1017.6) in feature Pressure9am.\n --&gt; Imputing 13830 missing values with median (1015.2) in feature Pressure3pm.\n --&gt; Imputing 53496 missing values with median (5.0) in feature Cloud9am.\n --&gt; Imputing 56933 missing values with median (5.0) in feature Cloud3pm.\n --&gt; Imputing 743 missing values with median (16.7) in feature Temp9am.\n --&gt; Imputing 2565 missing values with median (21.1) in feature Temp3pm.\n --&gt; Imputing 1354 missing values with most_frequent (No) in feature RainToday.\nFitting Encoder...\nEncoding categorical columns...\n --&gt; Target-encoding feature Location. Contains 49 classes.\n --&gt; Target-encoding feature WindGustDir. Contains 16 classes.\n --&gt; Target-encoding feature WindDir9am. Contains 16 classes.\n --&gt; Target-encoding feature WindDir3pm. Contains 16 classes.\n --&gt; Ordinal-encoding feature RainToday. Contains 2 classes.\n</pre> In\u00a0[4]: Copied! 
<pre># Analyze the impact of the training set's size on a LR model\natom.train_sizing(\"LR\", train_sizes=10, n_bootstrap=5)\n</pre> # Analyze the impact of the training set's size on a LR model atom.train_sizing(\"LR\", train_sizes=10, n_bootstrap=5) <pre>\nTraining ========================= &gt;&gt;\nMetric: f1\n\n\nRun: 0 =========================== &gt;&gt;\nModels: LR01\nSize of training set: 11362 (10%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5624\nTest evaluation --&gt; f1: 0.5857\nTime elapsed: 0.721s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.585 \u00b1 0.0021\nTime elapsed: 0.729s\n-------------------------------------------------\nTime: 1.449s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.053s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.585 \u00b1 0.0021\n\n\nRun: 1 =========================== &gt;&gt;\nModels: LR02\nSize of training set: 22724 (20%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.582\nTest evaluation --&gt; f1: 0.5874\nTime elapsed: 0.853s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5851 \u00b1 0.002\nTime elapsed: 0.865s\n-------------------------------------------------\nTime: 1.718s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 2.425s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5851 \u00b1 0.002\n\n\nRun: 2 =========================== &gt;&gt;\nModels: LR03\nSize of training set: 34087 (30%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5812\nTest evaluation --&gt; f1: 0.585\nTime elapsed: 1.086s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5861 \u00b1 
0.0009\nTime elapsed: 1.119s\n-------------------------------------------------\nTime: 2.205s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.035s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5861 \u00b1 0.0009\n\n\nRun: 3 =========================== &gt;&gt;\nModels: LR04\nSize of training set: 45449 (40%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5828\nTest evaluation --&gt; f1: 0.5862\nTime elapsed: 1.173s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5863 \u00b1 0.0017\nTime elapsed: 1.282s\n-------------------------------------------------\nTime: 2.455s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.365s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5863 \u00b1 0.0017\n\n\nRun: 4 =========================== &gt;&gt;\nModels: LR05\nSize of training set: 56812 (50%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5823\nTest evaluation --&gt; f1: 0.5853\nTime elapsed: 1.264s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.585 \u00b1 0.0016\nTime elapsed: 1.460s\n-------------------------------------------------\nTime: 2.724s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.758s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.585 \u00b1 0.0016\n\n\nRun: 5 =========================== &gt;&gt;\nModels: LR06\nSize of training set: 68174 (60%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5835\nTest evaluation --&gt; f1: 0.5843\nTime elapsed: 1.392s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.585 \u00b1 0.0016\nTime elapsed: 
1.704s\n-------------------------------------------------\nTime: 3.095s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 4.151s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.585 \u00b1 0.0016\n\n\nRun: 6 =========================== &gt;&gt;\nModels: LR07\nSize of training set: 79536 (70%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5872\nTest evaluation --&gt; f1: 0.5846\nTime elapsed: 1.585s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5852 \u00b1 0.0013\nTime elapsed: 1.836s\n-------------------------------------------------\nTime: 3.421s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 4.664s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5852 \u00b1 0.0013\n\n\nRun: 7 =========================== &gt;&gt;\nModels: LR08\nSize of training set: 90899 (80%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5889\nTest evaluation --&gt; f1: 0.5841\nTime elapsed: 1.693s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5852 \u00b1 0.0025\nTime elapsed: 2.139s\n-------------------------------------------------\nTime: 3.832s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.157s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5852 \u00b1 0.0025\n\n\nRun: 8 =========================== &gt;&gt;\nModels: LR09\nSize of training set: 102261 (90%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5871\nTest evaluation --&gt; f1: 0.5837\nTime elapsed: 1.754s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5844 \u00b1 0.0022\nTime elapsed: 
2.353s\n-------------------------------------------------\nTime: 4.107s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.464s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5844 \u00b1 0.0022\n\n\nRun: 9 =========================== &gt;&gt;\nModels: LR10\nSize of training set: 113624 (100%)\nSize of test set: 28408\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f1: 0.5856\nTest evaluation --&gt; f1: 0.585\nTime elapsed: 1.978s\nBootstrap ---------------------------------------\nEvaluation --&gt; f1: 0.5846 \u00b1 0.0005\nTime elapsed: 2.544s\n-------------------------------------------------\nTime: 4.521s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 5.975s\n-------------------------------------\nLogisticRegression --&gt; f1: 0.5846 \u00b1 0.0005\n</pre> In\u00a0[5]: Copied! <pre># The results are now multi-index, where frac is the fraction\n# of the training set used to fit the model. The model names\n# end with the fraction as well (without the dot)\natom.results\n</pre> # The results are now multi-index, where frac is the fraction # of the training set used to fit the model. The model names # end with the fraction as well (without the dot) atom.results Out[5]: f1_train f1_test time_fit f1_bootstrap time_bootstrap time frac model 0.1 LR01 0.5622 0.5852 0.720655 0.585044 0.728664 1.449319 0.2 LR02 0.5830 0.5845 0.852776 0.585144 0.864794 1.717570 0.3 LR03 0.5795 0.5856 1.085709 0.586101 1.119410 2.205119 0.4 LR04 0.5847 0.5858 1.173066 0.586305 1.282166 2.455232 0.5 LR05 0.5836 0.5862 1.264150 0.585003 1.460329 2.724479 0.6 LR06 0.5832 0.5833 1.391943 0.584966 1.703550 3.095493 0.7 LR07 0.5880 0.5856 1.585444 0.585199 1.835532 3.420976 0.8 LR08 0.5914 0.5882 1.693054 0.585235 2.138652 3.831706 0.9 LR09 0.5854 0.5828 1.753595 0.584420 2.353141 4.106736 1.0 LR10 0.5862 0.5850 1.977799 0.584634 2.543574 4.521373 In\u00a0[6]: Copied! 
<pre># Every model can be accessed through its name\natom.lr05.plot_shap_waterfall(show=6)\n</pre> # Every model can be accessed through its name atom.lr05.plot_shap_waterfall(show=6) In\u00a0[7]: Copied! <pre># Plot the train sizing's results\natom.plot_learning_curve()\n</pre> # Plot the train sizing's results atom.plot_learning_curve()"}, {"location": "examples/train_sizing/#example-train-sizing", "title": "Example: Train sizing\u00b6", "text": "<p>This example shows how to asses a model's performance based on the size of the training set.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/train_sizing/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/train_sizing/#run-the-pipeline", "title": "Run the pipeline\u00b6", "text": ""}, {"location": "examples/train_sizing/#analyze-the-results", "title": "Analyze the results\u00b6", "text": ""}, {"location": "examples/utilities/", "title": "Utilities", "text": "In\u00a0[1]: Copied! <pre># Import packages\nimport tempfile\nimport pandas as pd\nfrom sklearn.metrics import fbeta_score\nfrom atom import ATOMClassifier\n</pre> # Import packages import tempfile import pandas as pd from sklearn.metrics import fbeta_score from atom import ATOMClassifier In\u00a0[2]: Copied! <pre># Load data\nX = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")\n\n# Let's have a look\nX.head()\n</pre> # Load data X = pd.read_csv(\"docs_source/examples/datasets/weatherAUS.csv\")  # Let's have a look X.head() Out[2]: Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... 
Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RainTomorrow 0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 SSE 41.0 W SSE ... 95.0 54.0 1019.5 1017.0 8.0 5.0 18.5 26.0 Yes 0 1 Adelaide 17.2 23.4 0.0 NaN NaN S 41.0 S WSW ... 59.0 36.0 1015.7 1015.7 NaN NaN 17.7 21.9 No 0 2 Cairns 18.6 24.6 7.4 3.0 6.1 SSE 54.0 SSE SE ... 78.0 57.0 1018.7 1016.6 3.0 3.0 20.8 24.1 Yes 0 3 Portland 13.6 16.8 4.2 1.2 0.0 ESE 39.0 ESE ESE ... 76.0 74.0 1021.4 1020.5 7.0 8.0 15.6 16.0 Yes 1 4 Walpole 16.4 19.9 0.0 NaN NaN SE 44.0 SE SE ... 78.0 70.0 1019.4 1018.9 NaN NaN 17.4 18.1 No 0 <p>5 rows \u00d7 22 columns</p> In\u00a0[3]: Copied! <pre>atom = ATOMClassifier(X, random_state=1)\natom.clean()\n\n# Quickly check what columns have missing values\nprint(f\"Columns with missing values:\\n{atom.nans}\")\n\n# Or what columns are categorical\nprint(f\"\\nCategorical columns: {atom.categorical}\")\n\n# Or if the dataset is scaled\nprint(f\"\\nIs the dataset scaled? {atom.scaled}\")\n</pre> atom = ATOMClassifier(X, random_state=1) atom.clean()  # Quickly check what columns have missing values print(f\"Columns with missing values:\\n{atom.nans}\")  # Or what columns are categorical print(f\"\\nCategorical columns: {atom.categorical}\")  # Or if the dataset is scaled print(f\"\\nIs the dataset scaled? 
{atom.scaled}\") <pre>Columns with missing values:\nLocation             0\nMinTemp            637\nMaxTemp            322\nRainfall          1406\nEvaporation      60843\nSunshine         67816\nWindGustDir       9330\nWindGustSpeed     9270\nWindDir9am       10013\nWindDir3pm        3778\nWindSpeed9am      1348\nWindSpeed3pm      2630\nHumidity9am       1774\nHumidity3pm       3610\nPressure9am      14014\nPressure3pm      13981\nCloud9am         53657\nCloud3pm         57094\nTemp9am            904\nTemp3pm           2726\nRainToday         1406\ndtype: int64\n\nCategorical columns: Index(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'], dtype='object')\n\nIs the dataset scaled? False\n</pre> In\u00a0[4]: Copied! <pre># Note the number of missing values and categorical columns\natom.stats()\n</pre> # Note the number of missing values and categorical columns atom.stats() <pre>Dataset stats ==================== &gt;&gt;\nShape: (142193, 22)\nTrain set size: 113755\nTest set size: 28438\n-------------------------------------\nMemory: 27.44 MB\nScaled: False\nMissing values: 316559 (10.1%)\nCategorical features: 5 (23.8%)\nDuplicates: 45 (0.0%)\n</pre> In\u00a0[5]: Copied! <pre># Now, let's impute and encode the dataset...\natom.impute()\natom.encode()\n\n# ... and the values are gone\natom.stats()\n</pre> # Now, let's impute and encode the dataset... atom.impute() atom.encode()  # ... and the values are gone atom.stats() <pre>Dataset stats ==================== &gt;&gt;\nShape: (56420, 22)\nTrain set size: 45075\nTest set size: 11345\n-------------------------------------\nMemory: 11.11 MB\nScaled: False\nOutlier values: 3203 (0.3%)\n</pre> In\u00a0[6]: Copied! <pre># Compare the relationship of multiple columns with a scatter maxtrix\natom.plot_relationships(columns=slice(0, 5))\n</pre> # Compare the relationship of multiple columns with a scatter maxtrix atom.plot_relationships(columns=slice(0, 5)) In\u00a0[7]: Copied! 
<pre># Check which distribution fits a column best\natom.distribution(columns=\"Rainfall\")\n</pre> # Check which distribution fits a column best atom.distribution(columns=\"Rainfall\") Out[7]: Rainfall dist stat beta score 0.6506 p_value 0.0 expon score 0.6506 p_value 0.0 gamma score 0.6465 p_value 0.0 invgauss score 0.6257 p_value 0.0 lognorm score 0.6485 p_value 0.0 norm score 0.3807 p_value 0.0 pearson3 score 0.6506 p_value 0.0 triang score 0.7191 p_value 0.0 uniform score 0.8914 p_value 0.0 weibull_min score 0.6506 p_value 0.0 weibull_max score 0.8896 p_value 0.0 In\u00a0[8]: Copied! <pre># Investigate a column's distribution\natom.plot_distribution(columns=\"MinTemp\", distributions=\"beta\")\natom.plot_qq(columns=\"MinTemp\", distributions=\"beta\")\n</pre> # Investigate a column's distribution atom.plot_distribution(columns=\"MinTemp\", distributions=\"beta\") atom.plot_qq(columns=\"MinTemp\", distributions=\"beta\") <p>There are two ways to quickly transform the dataset mid-pipeline. The first way is through the property's <code>@setter</code>. The downside for this approach is that the transformation is not stored in atom's pipeline, so the transformation is not applied on new data. Therefore, we recommend using the second approach, through the add method.</p> In\u00a0[9]: Copied! <pre># Note that we can only replace a dataframe with a new dataframe!\natom.X = atom.X.assign(AvgTemp=(atom.X[\"MaxTemp\"] + atom.X[\"MinTemp\"])/2)\n\n# This will automatically update all other data attributes\nassert \"AvgTemp\" in atom\n\n# But it's not saved to atom's pipeline\natom.pipeline\n</pre> # Note that we can only replace a dataframe with a new dataframe! 
atom.X = atom.X.assign(AvgTemp=(atom.X[\"MaxTemp\"] + atom.X[\"MinTemp\"])/2)  # This will automatically update all other data attributes assert \"AvgTemp\" in atom  # But it's not saved to atom's pipeline atom.pipeline Out[9]: <pre>Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare'))])</pre>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.Pipeline<pre>Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare'))])</pre>Cleaner<pre>Cleaner()</pre>Imputer<pre>Imputer()</pre>Encoder<pre>Encoder(value='rare')</pre> In\u00a0[10]: Copied! <pre># Same transformation, different approach (AvgTemp is overwritten)\ndef transform(df):\n    df[\"AvgTemp\"] = (df.MaxTemp + df.MinTemp) / 2\n    return df\n\natom.apply(transform)\n\nassert \"AvgTemp\" in atom\n</pre> # Same transformation, different approach (AvgTemp is overwritten) def transform(df):     df[\"AvgTemp\"] = (df.MaxTemp + df.MinTemp) / 2     return df  atom.apply(transform)  assert \"AvgTemp\" in atom In\u00a0[11]: Copied! <pre># Now the function appears in the pipeline\natom.pipeline\n</pre> # Now the function appears in the pipeline atom.pipeline Out[11]: <pre>Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare')),\n                ('FunctionTransformer',\n                 FunctionTransformer(func=&lt;function transform at 0x0000016745DF6B90&gt;))])</pre>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. 
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.Pipeline<pre>Pipeline(memory=Memory(location=None),\n         steps=[('Cleaner', Cleaner()), ('Imputer', Imputer()),\n                ('Encoder', Encoder(value='rare')),\n                ('FunctionTransformer',\n                 FunctionTransformer(func=&lt;function transform at 0x0000016745DF6B90&gt;))])</pre>Cleaner<pre>Cleaner()</pre>Imputer<pre>Imputer()</pre>Encoder<pre>Encoder(value='rare')</pre>FunctionTransformer<pre>FunctionTransformer(func=&lt;function transform at 0x0000016745DF6B90&gt;)</pre> In\u00a0[12]: Copied! <pre>atom.available_models()\n</pre> atom.available_models() Out[12]: acronym model estimator module needs_scaling accepts_sparse native_multilabel native_multioutput has_validation supports_engines 0 AdaB AdaBoost AdaBoostClassifier sklearnensemble False True False False False sklearn 1 Bag Bagging BaggingClassifier sklearnensemble False True False False False sklearn 2 BNB BernoulliNB BernoulliNB sklearnnaive_bayes False True False False False sklearn, cuml 3 CatB CatBoost CatBoostClassifier catboostcatboost True True False False True catboost 4 CatNB CategoricalNB CategoricalNB sklearnnaive_bayes False True False False False sklearn, cuml 5 CNB ComplementNB ComplementNB sklearnnaive_bayes False True False False False sklearn, cuml 6 Tree DecisionTree DecisionTreeClassifier sklearntree False True True True False sklearn 7 Dummy Dummy DummyClassifier sklearndummy False False False False False sklearn 8 ETree ExtraTree ExtraTreeClassifier sklearntree False True True True False sklearn 9 ET ExtraTrees ExtraTreesClassifier sklearnensemble False True True True False sklearn 10 GNB GaussianNB GaussianNB sklearnnaive_bayes False False False False False sklearn, cuml 11 GP GaussianProcess GaussianProcessClassifier sklearngaussian_process False False False False False sklearn 12 GBM GradientBoostingMachine GradientBoostingClassifier sklearnensemble 
False True False False False sklearn 13 hGBM HistGradientBoosting HistGradientBoostingClassifier sklearnensemble False False False False False sklearn 14 KNN KNearestNeighbors KNeighborsClassifier sklearnneighbors True True True True False sklearn, sklearnex, cuml 15 LGB LightGBM LGBMClassifier lightgbmlightgbm.sklearn True True False False True lightgbm 16 LDA LinearDiscriminantAnalysis LinearDiscriminantAnalysis sklearndiscriminant_analysis False False False False False sklearn 17 lSVM LinearSVM LinearSVC sklearnsvm True True False False False sklearn, cuml 18 LR LogisticRegression LogisticRegression sklearnlinear_model True True False False False sklearn, sklearnex, cuml 19 MLP MultiLayerPerceptron MLPClassifier sklearnneural_network True True True False True sklearn 20 MNB MultinomialNB MultinomialNB sklearnnaive_bayes False True False False False sklearn, cuml 21 PA PassiveAggressive PassiveAggressiveClassifier sklearnlinear_model True True False False True sklearn 22 Perc Perceptron Perceptron sklearnlinear_model True False False False True sklearn 23 QDA QuadraticDiscriminantAnalysis QuadraticDiscriminantAnalysis sklearndiscriminant_analysis False False False False False sklearn 24 RNN RadiusNearestNeighbors RadiusNeighborsClassifier sklearnneighbors True True True True False sklearn 25 RF RandomForest RandomForestClassifier sklearnensemble False True True True False sklearn, sklearnex, cuml 26 Ridge Ridge RidgeClassifier sklearnlinear_model True True True False False sklearn, sklearnex, cuml 27 SGD StochasticGradientDescent SGDClassifier sklearnlinear_model True True False False True sklearn 28 SVM SupportVectorMachine SVC sklearnsvm True True False False False sklearn, sklearnex, cuml 29 XGB XGBoost XGBClassifier xgboostxgboost True True False False True xgboost In\u00a0[13]: Copied! 
<pre>atom.verbose = 1\n\n# Define a custom metric\ndef f2(y_true, y_pred):\n    return fbeta_score(y_true, y_pred, beta=2)\n\n# Use the greater_is_better, needs_proba and needs_threshold parameters if necessary\natom.run(models=\"LR\", metric=f2)\n</pre> atom.verbose = 1  # Define a custom metric def f2(y_true, y_pred):     return fbeta_score(y_true, y_pred, beta=2)  # Use the greater_is_better, needs_proba and needs_threshold parameters if necessary atom.run(models=\"LR\", metric=f2) <pre>\nTraining ========================= &gt;&gt;\nModels: LR\nMetric: f2\n\n\nResults for LogisticRegression:\nFit ---------------------------------------------\nTrain evaluation --&gt; f2: 0.5693\nTest evaluation --&gt; f2: 0.5709\nTime elapsed: 0.863s\n-------------------------------------------------\nTime: 0.863s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 1.491s\n-------------------------------------\nLogisticRegression --&gt; f2: 0.5709\n</pre> In\u00a0[14]: Copied! <pre># You can use the est_params parameter to customize the estimator\n# Let's run AdaBoost using LR instead of a decision tree as base estimator\natom.run(\"AdaB\", est_params={\"base_estimator\": atom.lr.estimator})\n</pre> # You can use the est_params parameter to customize the estimator # Let's run AdaBoost using LR instead of a decision tree as base estimator atom.run(\"AdaB\", est_params={\"base_estimator\": atom.lr.estimator}) <pre>\nTraining ========================= &gt;&gt;\nModels: AdaB\nMetric: f2\n\n\nResults for AdaBoost:\nFit ---------------------------------------------\nTrain evaluation --&gt; f2: 0.556\nTest evaluation --&gt; f2: 0.5636\nTime elapsed: 2.568s\n-------------------------------------------------\nTime: 2.568s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 3.065s\n-------------------------------------\nAdaBoost --&gt; f2: 0.5636\n</pre> In\u00a0[15]: Copied! 
<pre>atom.adab.estimator\n</pre> atom.adab.estimator Out[15]: <pre>AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=1, random_state=1),\n                   random_state=1)</pre>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.AdaBoostClassifier<pre>AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=1, random_state=1),\n                   random_state=1)</pre>base_estimator: LogisticRegression<pre>LogisticRegression(n_jobs=1, random_state=1)</pre>LogisticRegression<pre>LogisticRegression(n_jobs=1, random_state=1)</pre> In\u00a0[16]: Copied! <pre># Note that parameters specified by est_params are not optimized in the BO\natom.run(\n    models=\"Tree\",\n    n_trials=10,\n    est_params={\n        \"criterion\": \"gini\",\n        \"splitter\": \"best\",\n        \"min_samples_leaf\": 1,\n        \"ccp_alpha\": 0.035,\n    },\n    verbose=2,\n)\n</pre> # Note that parameters specified by est_params are not optimized in the BO atom.run(     models=\"Tree\",     n_trials=10,     est_params={         \"criterion\": \"gini\",         \"splitter\": \"best\",         \"min_samples_leaf\": 1,         \"ccp_alpha\": 0.035,     },     verbose=2, ) <pre>\nTraining ========================= &gt;&gt;\nModels: Tree\nMetric: f2\n\n\nRunning hyperparameter tuning for DecisionTree...\n| trial | max_depth | min_samples_split | max_features |      f2 | best_f2 | time_trial | time_ht |    state |\n| ----- | --------- | ----------------- | ------------ | ------- | ------- | ---------- | ------- | -------- |\n| 0     |        13 |                12 |          0.5 |  0.4362 |  0.4362 |     3.161s |  3.161s | COMPLETE |\n| 1     |        14 |                16 |         log2 |  0.4729 |  0.4729 |     2.872s |  6.033s | COMPLETE |\n| 2     |        16 |                13 |          0.8 |  0.4626 |  0.4729 |     
3.201s |  9.234s | COMPLETE |\n| 3     |         9 |                 6 |         None |  0.4903 |  0.4903 |     3.075s | 12.309s | COMPLETE |\n| 4     |         5 |                 2 |         log2 |  0.4889 |  0.4903 |     2.812s | 15.121s | COMPLETE |\n| 5     |         1 |                15 |          0.5 |  0.4953 |  0.4953 |     2.827s | 17.948s | COMPLETE |\n| 6     |        15 |                 9 |         sqrt |  0.5004 |  0.5004 |     2.951s | 20.899s | COMPLETE |\n| 7     |        13 |                20 |         None |  0.5004 |  0.5004 |     3.242s | 24.141s | COMPLETE |\n| 8     |         3 |                19 |          0.5 |  0.4936 |  0.5004 |     2.800s | 26.941s | COMPLETE |\n| 9     |        15 |                20 |         sqrt |  0.4762 |  0.5004 |     3.170s | 30.111s | COMPLETE |\nHyperparameter tuning ---------------------------\nBest trial --&gt; 6\nBest parameters:\n --&gt; max_depth: 15\n --&gt; min_samples_split: 9\n --&gt; max_features: sqrt\nBest evaluation --&gt; f2: 0.5004\nTime elapsed: 30.111s\nFit ---------------------------------------------\nTrain evaluation --&gt; f2: 0.4925\nTest evaluation --&gt; f2: 0.4925\nTime elapsed: 0.452s\n-------------------------------------------------\nTime: 30.563s\n\n\nFinal results ==================== &gt;&gt;\nTotal time: 30.885s\n-------------------------------------\nDecisionTree --&gt; f2: 0.4925\n</pre> <p>Note that both instances need to be initialized with the same data and use the same metric for model training to be able to merge.</p> In\u00a0[17]: Copied! <pre>tempdir = tempfile.gettempdir()\n</pre> tempdir = tempfile.gettempdir() In\u00a0[18]: Copied! 
<pre># Save the atom instance as a pickle\n# Use save_data=False to save the instance without the data\natom.save(tempdir + \"atom\", save_data=False)\n</pre> # Save the atom instance as a pickle # Use save_data=False to save the instance without the data atom.save(tempdir + \"atom\", save_data=False) <pre>ATOMClassifier successfully saved.\n</pre> In\u00a0[20]: Copied! <pre># No need to store the transformed data, providing the original dataset to\n# the loader automatically transforms it through all the steps in the pipeline\natom_2 = ATOMClassifier.load(tempdir + \"atom\", data=(X,))\n</pre> # No need to store the transformed data, providing the original dataset to # the loader automatically transforms it through all the steps in the pipeline atom_2 = ATOMClassifier.load(tempdir + \"atom\", data=(X,)) <pre>ATOMClassifier successfully loaded.\n</pre> In\u00a0[21]: Copied! <pre># Create a separate instance with its own branch and model\natom_3 = ATOMClassifier(X, verbose=0, random_state=1)\natom_3.branch.name = \"lightgbm\"\natom_3.impute()\natom_3.encode()\natom_3.run(\"LGB\", metric=f2)\n</pre> # Create a separate instance with its own branch and model atom_3 = ATOMClassifier(X, verbose=0, random_state=1) atom_3.branch.name = \"lightgbm\" atom_3.impute() atom_3.encode() atom_3.run(\"LGB\", metric=f2) In\u00a0[22]: Copied! <pre># Merge the instances\natom_2.merge(atom_3)\n</pre> # Merge the instances atom_2.merge(atom_3) <pre>Merging instances...\n --&gt; Merging branch lightgbm.\n --&gt; Merging model LGB.\n --&gt; Merging attributes.\n</pre> In\u00a0[23]: Copied! <pre># Note that it now contains both branches and all models\natom_2\n</pre> # Note that it now contains both branches and all models atom_2 Out[23]: <pre>ATOMClassifier\n --&gt; Branches:\n   --&gt; main !\n   --&gt; lightgbm\n --&gt; Models: LR, AdaB, Tree, LGB\n --&gt; Metric: f2</pre> In\u00a0[24]: Copied! 
<pre>atom_2.results\n</pre> atom_2.results Out[24]: f2_train f2_test time_fit time frac model 0.8 AdaB 0.5599 0.5590 2.568021 2.568021 LR 0.5723 0.5685 0.863496 0.863496 Tree 0.4930 0.4928 0.452411 30.563017 1.0 LGB 0.6578 0.5909 3.991159 3.991159"}, {"location": "examples/utilities/#example-utilities", "title": "Example: Utilities\u00b6", "text": "<p>This example shows various useful utilities that can be used to improve atom's pipelines.</p> <p>The data used is a variation on the Australian weather dataset from Kaggle. You can download it from here. The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target <code>RainTomorrow</code>.</p>"}, {"location": "examples/utilities/#load-the-data", "title": "Load the data\u00b6", "text": ""}, {"location": "examples/utilities/#use-the-utility-attributes", "title": "Use the utility attributes\u00b6", "text": ""}, {"location": "examples/utilities/#use-the-stats-method-to-assess-changes-in-the-dataset", "title": "Use the stats method to assess changes in the dataset\u00b6", "text": ""}, {"location": "examples/utilities/#inspect-feature-distributions", "title": "Inspect feature distributions\u00b6", "text": ""}, {"location": "examples/utilities/#change-the-data-mid-pipeline", "title": "Change the data mid-pipeline\u00b6", "text": ""}, {"location": "examples/utilities/#get-an-overview-of-the-available-models", "title": "Get an overview of the available models\u00b6", "text": ""}, {"location": "examples/utilities/#use-a-custom-metric", "title": "Use a custom metric\u00b6", "text": ""}, {"location": "examples/utilities/#customize-the-estimators-parameters", "title": "Customize the estimator's parameters\u00b6", "text": ""}, {"location": "examples/utilities/#save-load", "title": "Save &amp; load\u00b6", "text": ""}, {"location": "user_guide/accelerating/", "title": "Accelerating pipelines", "text": "<p>For very large datasets, ATOM offers various ways to accelerate its 
pipeline:</p> <ul> <li>Run estimators on GPU</li> <li>Use a faster data engine</li> <li>Use a faster estimator engine</li> <li>Run processes in parallel</li> </ul> <p>Warning</p> <p>Performance improvements are usually noticeable for datasets larger  than ~5M rows. For smaller datasets, using other values than the default can even harm performance!</p>"}, {"location": "user_guide/accelerating/#gpu-acceleration", "title": "GPU acceleration", "text": "<p>Graphics Processing Units (GPUs) can significantly accelerate calculations for preprocessing steps or training machine learning models. Training models involves compute-intensive matrix multiplications and other operations that can take advantage of a GPU's massively parallel architecture. Training on large datasets can take hours to run on a single processor. However, if you offload those tasks to a GPU, you can reduce training time to minutes instead.</p> <p>Running transformers and models in atom using a GPU is as easy as initializing the instance with parameter <code>device=\"gpu\"</code>. The <code>device</code> parameter accepts any string that follows the SYCL_DEVICE_FILTER filter selector. Examples are:</p> <ul> <li>device=\"cpu\" (use CPU)</li> <li>device=\"gpu\" (use default GPU)</li> <li>device=\"gpu:0\" (use first GPU)</li> <li>device=\"gpu:1\" (use second GPU)</li> </ul> <p>Combine GPU acceleration with the cuml and sklearnex estimator engines. The XGBoost, LightGBM and CatBoost models come with their own GPU engine. Setting <code>device=\"gpu\"</code> is sufficient to accelerate them with GPU, regardless of the engine parameter.</p> <p>Warning</p> <p>ATOM does not support multi-GPU training. If there is more than one GPU on the machine and the <code>device</code> parameter does not specify which one to use, the first one is used by default.</p> <p>Example</p> <p> Train a model on a GPU yourself using SageMaker Studio Lab. Just click on the badge above and run the notebook! 
Make sure to choose the GPU compute type.</p>"}, {"location": "user_guide/accelerating/#data-acceleration", "title": "Data acceleration", "text": "<p>The data engine can be specified through the <code>engine</code> parameter, which takes a dict with a key <code>data</code> that accepts three values: numpy, pyarrow and modin.</p>"}, {"location": "user_guide/accelerating/#numpy", "title": "numpy", "text": "<p>ATOM uses <code>pandas</code> as the default library for data handling, which in turn, uses <code>numpy</code> for all data processing.</p>"}, {"location": "user_guide/accelerating/#pyarrow", "title": "pyarrow", "text": "<p>PyArrow is a library that provides a way to work with Apache Arrow memory structures. Apache Arrow is a cross-language, platform-independent, in-memory data format that provides an efficient and fast way to serialize and deserialize data. Pandas offers native integration with pyarrow, which atom uses when specifying the pyarrow data engine.</p> <p>Warning</p> <ul> <li>The pyarrow backend doesn't work for sparse datasets. If the   dataset has any sparse columns, an exception is raised.</li> <li>The LightGBM and XGBoost models don't support pyarrow   dtypes.</li> </ul>"}, {"location": "user_guide/accelerating/#modin", "title": "modin", "text": "<p>The modin library is a multi-threading, drop-in replacement for pandas, that uses Ray as backend.</p>"}, {"location": "user_guide/accelerating/#estimator-acceleration", "title": "Estimator acceleration", "text": "<p>The estimator engine can be specified through the <code>engine</code> parameter, which takes a dict with a key <code>estimator</code> that accepts three values: sklearn, sklearnex and cuml. 
Read here how to run the estimators on GPU instead of CPU.</p> <p>Warning</p> <p>Estimators accelerated with sklearnex or cuML sometimes use slightly different hyperparameters than their sklearn counterparts.</p>"}, {"location": "user_guide/accelerating/#sklearn", "title": "sklearn", "text": "<p>This is the default option, which uses the standard estimators from sklearn. Sklearn does not support training on GPU.</p>"}, {"location": "user_guide/accelerating/#sklearnex", "title": "sklearnex", "text": "<p>The Intel\u00ae Extension for Scikit-learn package (or sklearnex, for brevity) accelerates sklearn models and transformers, keeping full conformance with sklearn's API. Sklearnex is a free software AI accelerator that offers a way to make sklearn code 10\u2013100 times faster. The software acceleration is achieved through the use of vector instructions, IA hardware-specific memory optimizations, threading, and optimizations for all upcoming Intel platforms at launch time. See here an example using the sklearnex engine.</p> <p>Warning</p> <p>sklearnex estimators don't support sparse datasets nor multioutput tasks.</p> <p>Tip</p> <p>Intel\u00ae processors provide better performance than other CPUs.</p>"}, {"location": "user_guide/accelerating/#prerequisites", "title": "Prerequisites", "text": "<ul> <li>Operating System:<ul> <li>Linux (Ubuntu, Fedora, etc...)</li> <li>Windows 8.1+</li> <li>macOS (no GPU support)</li> </ul> </li> <li>CPU:<ul> <li>Processor must have x86 architecture.</li> <li>Processor must support at least one of SSE2, AVX, AVX2, AVX512 instruction sets.</li> <li>ARM* architecture is not supported.</li> </ul> </li> <li>GPU:<ul> <li>All Intel\u00ae integrated and discrete GPUs.</li> <li>Intel\u00ae GPU drivers.</li> </ul> </li> <li>Libraries:<ul> <li>sklearnex&gt;=2023.2.1 (automatically installed with atom when the processor has x86 architecture)</li> <li>dpcpp_cpp_rt&gt;=2023.2  (only for GPU acceleration)</li> </ul> </li> </ul>"}, {"location": 
"user_guide/accelerating/#supported-estimators", "title": "Supported estimators", "text": "<ul> <li>Pruner (only for strategy=\"dbscan\")</li> <li> <p>FeatureSelector (only for strategy=\"pca\" and dense datasets)</p> </li> <li> <p>ElasticNet (only for CPU acceleration)</p> </li> <li>KNearestNeighbors</li> <li>Lasso (only for CPU acceleration)</li> <li>LogisticRegression</li> <li>OrdinaryLeastSquares</li> <li>RandomForest</li> <li>Ridge (only for regression tasks and CPU acceleration)</li> <li>SupportVectorMachine (GPU acceleration only supports classification tasks)</li> </ul>"}, {"location": "user_guide/accelerating/#cuml", "title": "cuML", "text": "<p>cuML is the machine learning library of the RAPIDS project. cuML enables you to run traditional tabular ML tasks on GPUs without going into the details of CUDA programming. For large datasets, these GPU-based implementations can complete 10-50x faster than their CPU equivalents.</p> <p>Warning</p> <ul> <li>cuML estimators don't support multioutput tasks nor the pyarrow   data engine.</li> <li>Install cuML using <code>pip install --extra-index-url=https://pypi.nvidia.com   cuml-cu11</code> or <code>pip install --extra-index-url=https://pypi.nvidia.com   cuml-cu12</code> depending on your CUDA version. Read more about RAPIDS'   installation here.</li> </ul> <p>Tip</p> <p>Only transformers and predictors are converted to the requested engine. 
To use a metric from cuML, insert it directly in the <code>run</code> method:</p> <pre><code>from atom import ATOMClassifier\nfrom cuml.metrics import accuracy_score\nfrom sklearn.datasets import make_classification\n\nX, y = make_classification(n_samples=100, random_state=1)\n\natom = ATOMClassifier(X, y, engine={\"estimator\": \"cuml\"}, verbose=2)\natom.run(\"LR\", metric=accuracy_score)\n</code></pre>"}, {"location": "user_guide/accelerating/#prerequisites_1", "title": "Prerequisites", "text": "<ul> <li>Operating System:<ul> <li>Ubuntu 18.04/20.04 or CentOS 7/8 with gcc/++ 9.0+</li> <li>Windows 10+ with WSL2 (see here a tutorial)</li> </ul> </li> <li>GPU:<ul> <li>NVIDIA Pascal\u2122 or better with compute capability 6.0+</li> </ul> </li> <li>Drivers:<ul> <li>CUDA &amp; NVIDIA Drivers of versions 11.0, 11.2, 11.4 or 11.5</li> </ul> </li> <li>Libraries:<ul> <li>cuML&gt;=23.08</li> </ul> </li> </ul>"}, {"location": "user_guide/accelerating/#supported-estimators_1", "title": "Supported estimators", "text": "<ul> <li>Cleaner</li> <li>Discretizer</li> <li>Imputer (only for strat_num!=\"knn\")</li> <li>Normalizer</li> <li>Pruner (only for strategy=\"dbscan\" and \"hdbscan\")</li> <li>Scaler</li> <li>Vectorizer</li> <li> <p>FeatureSelector (only for strategy=\"pca\")</p> </li> <li> <p>BernoulliNB</p> </li> <li>CategoricalNB</li> <li>ElasticNet</li> <li>GaussianNB</li> <li>KNearestNeighbors</li> <li>Lasso</li> <li>LinearSVM</li> <li>LogisticRegression</li> <li>MultinomialNB</li> <li>OrdinaryLeastSquares</li> <li>RandomForest</li> <li>Ridge (only for regression tasks)</li> <li>SupportVectorMachine</li> </ul>"}, {"location": "user_guide/accelerating/#parallel-execution", "title": "Parallel execution", "text": "<p>Another way to accelerate your pipelines is executing processes in parallel. 
Use the <code>backend</code> parameter to select one of several parallelization backends.</p> <ul> <li>loky: Used by default, can induce some communication and memory overhead   when exchanging input and output data with the worker Python processes. On   some rare systems (such as Pyodide), the loky backend may not be available.</li> <li>multiprocessing: Previous process-based backend based on <code>multiprocessing.Pool</code>.   Less robust than loky.</li> <li>threading: Very low-overhead backend but it suffers from the Python Global   Interpreter Lock if the called function relies a lot on Python objects. It's    mostly useful when the execution bottleneck is a compiled extension that   explicitly releases the GIL (for instance a Cython loop wrapped in a \"with nogil\"   block or an expensive call to a library such as numpy).</li> <li>ray: Ray is an open-source unified compute framework   that makes it easy to scale AI and Python workloads. Read more about Ray here.   See here an example use case.</li> </ul> <p>The parallelization backend is applied in the following cases:</p> <ul> <li>In every individual estimator that uses parallelization internally.</li> <li>To calculate cross-validated results during hyperparameter tuning.</li> <li>To train multiple models in parallel (when the trainer's <code>parallel</code> parameter is True).</li> <li>To calculate partial dependencies in plot_partial_dependence.</li> </ul> <p>Note</p> <p>The <code>n_jobs</code> parameter sets the number of cores for the individual models as well as for parallel training. You won't gain much training two models in parallel with 2 cores, when the models also parallelize computations internally. 
Instead, use parallel training for models that can't parallelize their training (their constructor doesn't have the <code>n_jobs</code> parameter).</p>"}, {"location": "user_guide/data_cleaning/", "title": "Data cleaning", "text": "<p>More often than not, you'll need to do some data cleaning before fitting your dataset to a model.  Usually, this involves importing different libraries and writing many lines of code. Since ATOM is all about fast exploration  and experimentation, it provides various data cleaning classes to apply the most common transformations fast and easy.</p> <p>Note</p> <ul> <li>All of atom's data cleaning methods automatically adopt the relevant   transformer attributes (<code>n_jobs</code>, <code>verbose</code>, <code>logger</code>, <code>random_state</code>)   from atom. A different choice can be added as parameter to the method   call, e.g., <code>atom.scale(verbose=2)</code>.</li> <li>Like the add method, the data cleaning methods   accept the <code>columns</code> parameter to only transform a subset of the   dataset's features, e.g., <code>atom.scale(columns=[0, 1])</code>. Read   more in the row and column selection section.</li> </ul> <p></p>"}, {"location": "user_guide/data_cleaning/#balancing-the-data", "title": "Balancing the data", "text": "<p>One of the common issues found in datasets that are used for classification is imbalanced classes. Data imbalance usually reflects an unequal distribution of classes within a dataset. For example, in a credit card fraud detection dataset, most of the transactions are non-fraud, and a very few cases are fraud. This leaves us with a very unbalanced ratio of fraud vs non-fraud cases. The Balancer class can oversample the minority class or undersample the majority class using any of the transformers implemented in the imblearn package. 
It can be  accessed from atom through the balance method.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#standard-data-cleaning", "title": "Standard data cleaning", "text": "<p>There are many data cleaning steps that are useful to perform on any dataset before modeling. These are general rules that apply almost on every use-case and every task. The Cleaner class is a convenient tool to apply such steps. It can be accessed from atom through the clean method. Use the class' parameters to choose which transformations to perform. The available steps are:</p> <ul> <li>Drop columns with specific data types.</li> <li>Strip categorical features from white spaces.</li> <li>Drop duplicate rows.</li> <li>Drop rows with missing values in the target column.</li> <li>Encode the target column.</li> </ul> <p></p>"}, {"location": "user_guide/data_cleaning/#binning-numerical-features", "title": "Binning numerical features", "text": "<p>Discretization (otherwise known as quantization or binning) provides a way to partition continuous features into discrete values. Certain datasets with continuous features may benefit from discretization, because discretization can transform the dataset of continuous attributes to one with only nominal attributes. Discretization is similar to constructing histograms for continuous data. However, histograms focus on counting features which fall into particular bins, whereas discretization focuses on assigning feature values to these bins. The Discretizer class can be used to bin continuous data into intervals. It can be accessed from atom through the discretize method.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#encoding-categorical-features", "title": "Encoding categorical features", "text": "<p>Many datasets contain categorical features. Their variables are typically stored as text values which represent various classes. 
Some examples include color (\u201cRed\u201d, \u201cYellow\u201d, \u201cBlue\u201d), size (\u201cSmall\u201d, \u201cMedium\u201d, \u201cLarge\u201d) or geographic designations (city or country). Regardless of what the value is used for, the challenge is determining how to use this data in the analysis. The majority of sklearn's models don't support direct manipulation of this kind of data. Use the Encoder class to encode categorical features to numerical values. It can be  accessed from atom through the encode method.</p> <p>There are many strategies to encode categorical columns. The Encoder class applies one strategy or another depending on the number of classes in the column to be encoded. When there are only two, the values are encoded with 0 or 1. When there are more than two, the columns can be encoded using one-hot encoding or any other strategy of the category-encoders package, depending on the value of the <code>max_onehot</code> parameter. One-hot encodes the column making a dummy feature for every class. This approach preserves all the information but increases the size of the dataset considerably, making it often an undesirable strategy for high cardinality features. Other strategies like Target transform the column in place.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#imputing-missing-values", "title": "Imputing missing values", "text": "<p>For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with ATOM's models which assume that all values in an array are numerical, and that all have and hold meaning. The Imputer class handles missing values in the dataset by either dropping or imputing the value. 
It can be accessed from atom through the impute method.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#normalizing-the-feature-set", "title": "Normalizing the feature set", "text": "<p>Use the Normalizer class to transform the feature set to follow a Normal (Gaussian)-like distribution. In general, data must be transformed when using models that assume normality in the residuals. Examples of such models are LogisticRegression, LinearDiscriminantAnalysis and GaussianNB. The class can be accessed from atom through the normalize method.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#handling-outliers", "title": "Handling outliers", "text": "<p>When modeling, it is important to clean the data sample to ensure that the observations best represent the problem. Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data. These are called outliers. Often, machine learning modeling and model skill in general can be improved by  understanding and even removing these outlier samples. The Pruner class offers 7 different strategies to detect outliers (described hereunder). It can be accessed from atom through the prune method.</p> <p>z-score The z-score of a value in the dataset is defined as the number of standard deviations by which the value is above or below the mean of the column. Values above or below a certain threshold (specified with the parameter <code>max_sigma</code>) are considered outliers. Note that, contrary to the rest of the strategies, this approach selects outlier values, not outlier samples! Because of this, it is possible to replace the outlier value instead of dropping the entire sample.</p> <p>Isolation Forest Uses a tree-based anomaly detection algorithm. It is based on modeling the normal data in such a way as to isolate anomalies that are both few and different in the feature space. 
Read more in sklearn's documentation.</p> <p>Elliptic Envelope If the input variables have a Gaussian distribution, then simple statistical methods can be used to detect outliers. For example, if the dataset has two input variables and both are Gaussian, the feature space forms a multidimensional Gaussian, and knowledge of this distribution can be used to identify values far from the distribution. This approach can be generalized by defining a hypersphere (ellipsoid) that covers the normal data, and data that falls outside this shape is considered an outlier. Read more in sklearn's documentation.</p> <p>Local Outlier Factor A simple approach to identifying outliers is to locate those examples that are far from the other examples in the feature space. This can work well for feature spaces with low dimensionality (few features) but becomes less reliable as the number of features is increased. The local outlier factor is a technique that attempts to harness the idea of nearest neighbors for outlier detection. Each example is assigned a score of how isolated or how likely it is to be outliers based on the size of its local neighborhood. Those examples with the largest score are more likely to be outliers. Read more in sklearn's documentation.</p> <p>One-class SVM The support vector machine algorithm, initially developed for binary classification tasks, can also be used for one-class classification. When modeling one class, the algorithm captures the density of the majority class and classifies examples on the extremes of the density function as outliers. This modification of SVM is referred to as One-Class SVM. Read more in sklearn's documentation.</p> <p>DBSCAN The DBSCAN algorithm views clusters as areas of high density separated by areas of low density. Due to this rather generic view, clusters found by DBSCAN can be any shape, as opposed to k-means which assumes that clusters are convex shaped. Samples that lie outside any cluster are considered outliers. 
Read more in sklearn's documentation.</p> <p>OPTICS The OPTICS algorithm shares many similarities with the DBSCAN algorithm, and can be considered a generalization of DBSCAN that relaxes the <code>eps</code> requirement from a single value to a value range. The key difference between DBSCAN and OPTICS is that the OPTICS algorithm builds a reachability graph, which assigns each sample both a reachability distance and a spot within the cluster ordering. These two attributes are assigned when the model is fitted, and are used to determine cluster membership. Read more in sklearn's documentation.</p> <p></p>"}, {"location": "user_guide/data_cleaning/#scaling-the-feature-set", "title": "Scaling the feature set", "text": "<p>Standardization of a dataset is a common requirement for many machine learning estimators; they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with zero mean and unit variance). The Scaler class lets you quickly scale atom's dataset using one of sklearn's scalers. It can be accessed from atom through the scale method. </p> <p>Info</p> <p>All strategies can utilize GPU speed-up. Click here for further information about GPU acceleration.</p>"}, {"location": "user_guide/data_management/", "title": "Data management", "text": ""}, {"location": "user_guide/data_management/#data-sets", "title": "Data sets", "text": "<p>ATOM is designed to work around one single dataset: the one with which atom is initialized. This is the dataset you want to explore, transform, and use for model training and validation. ATOM differentiates three different data sets:</p> <ul> <li>The training set is usually the largest of the data sets. As the   name suggests, this set is used to train the pipeline. During   hyperparameter tuning, only the training set is used to fit and   evaluate the estimator in every call. The training set in the current   branch can be accessed through the <code>train</code> attribute. 
Its   features and target can be accessed through <code>X_train</code> and <code>y_train</code>   respectively.</li> <li>The test set is used to evaluate the models. The model scores on   this set give an indication of how the model performs on new data. The   test set can be accessed through the <code>test</code> attribute. Its features   and target can be accessed through <code>X_test</code> and <code>y_test</code> respectively.</li> <li>The holdout set is an optional, separate set that should only be   used to evaluate the final model's performance. Create this set when   you are going to use the test set for an intermediate validation step.   The holdout set is immediately set apart during initialization and is   not considered part of atom's dataset (the <code>dataset</code> attribute only   returns the train and test sets). The holdout set is left untouched   until predictions are made on it, i.e., it does not undergo any pipeline   transformations until the data set is requested for the first time.   The holdout set is stored in atom's <code>holdout</code> attribute. See   here an example that shows how to use the holdout   data set.</li> </ul> <p>The data can be provided in different formats. If the data sets are not specified beforehand, you can input the features and target separately or together:</p> <ul> <li>X</li> <li>X, y</li> </ul> <p>Remember to use the <code>y</code> parameter to indicate the target column in X when using the first option. If not specified, the last column in X is used as target. In both these cases, the sizes of the sets are defined using the <code>test_size</code> and <code>holdout_size</code> parameters. 
Note that the splits are made after the subsample of the dataset with the <code>n_rows</code> parameter (when not left to its default value).</p> <p>If you already have the separate data sets, provide them using one of the following formats:</p> <ul> <li>train, test</li> <li>train, test, holdout</li> <li>X_train, X_test, y_train, y_test</li> <li>X_train, X_test, X_holdout, y_train, y_test, y_holdout</li> <li>(X_train, y_train), (X_test, y_test)</li> <li>(X_train, y_train), (X_test, y_test), (X_holdout, y_holdout)</li> </ul> <p>The input data is always converted internally to a dataframe, if it isn't one already. The column names should always be strings. If they are not, atom changes their type at initialization. If no column names are provided, default names are given of the form <code>X[N-1]</code>, where N stands for the n-th feature in the dataset.</p> <p></p>"}, {"location": "user_guide/data_management/#indexing", "title": "Indexing", "text": "<p>By default, atom resets the dataframe's index after initialization and after every transformation in the pipeline. To avoid this, specify the <code>index</code> parameter. If the dataset has an 'identifier' column, it is useful to use it as index for two reasons:</p> <ul> <li>An identifier doesn't usually contain any useful information   on the target column, and should therefore be removed before training.</li> <li>Predictions of specific rows can be accessed through their index.</li> </ul> <p>Warning</p> <p>Avoid duplicate indices in the dataframe. Having them raises an error when initializing atom and may potentially lead to unexpected behavior if introduced later.</p> <p></p>"}, {"location": "user_guide/data_management/#sparse-datasets", "title": "Sparse datasets", "text": "<p>If atom is initialized using a scipy sparse matrix, it is converted internally to a dataframe of sparse columns. Read more about pandas' sparse data structures here. 
The same conversion takes place when a transformer returns a sparse matrix, like for example, the Vectorizer.</p> <p>Note that ATOM considers a dataset to be sparse if any of the columns is sparse. A dataset can only benefit from sparsity when all its columns are sparse, hence mixing sparse and non-sparse columns is not recommended and can cause estimators to decrease their training speed or even crash. Use the shrink method to convert dense features to sparse and the available_models method to check which models have native support for sparse matrices.</p> <p>Click here to see an example that uses sparse data.</p> <p>Warning</p> <p>Estimators accelerated using sklearnex don't support sparse  datasets.</p> <p></p>"}, {"location": "user_guide/data_management/#multioutput-tasks", "title": "Multioutput tasks", "text": "<p>Multioutput is a task where there are more than one target column, i.e., the goal is to predict multiple targets at the same time. When providing a dataframe as target, use the y parameter. Providing <code>y</code> without keyword makes ATOM think you are providing <code>train, test</code> (see the data sets section).</p>"}, {"location": "user_guide/data_management/#task-types", "title": "Task types", "text": "<p>ATOM recognizes four multioutput tasks.</p> <p>Note</p> <p>Combinations of binary and multiclass target columns are treated as multiclass-multioutput tasks.</p>"}, {"location": "user_guide/data_management/#multilabel", "title": "Multilabel", "text": "<p>Multilabel is a classification task, labeling each sample with <code>m</code> labels from <code>n_classes</code> possible classes, where <code>m</code> can be 0 to <code>n_classes</code> inclusive. This can be thought of as predicting properties of a sample that are not mutually exclusive.</p> <p>For example, prediction of the topics relevant to a text document. The document may be about one of religion, politics, finance or education, several of the topic classes or all of the topic classes. 
The target column (<code>atom.y</code>) could look like this:</p> <pre><code>0                        [politics]\n1               [religion, finance]\n2    [politics, finance, education]\n3                                []\n4                         [finance]\n5               [finance, religion]\n6                         [finance]\n7               [religion, finance]\n8                       [education]\n9     [finance, religion, politics]\n\nName: target, dtype: object\n</code></pre> <p>A model can not directly ingest a variable amount of target classes. Use the clean method to assign a binary output to each class, for every sample. Positive classes are indicated with 1 and negative classes with 0. It is thus comparable to running n_classes binary classification tasks. In our example, the target (<code>atom.y</code>) is converted to:</p> <pre><code>   education  finance  politics  religion\n0          0        0         1         0\n1          0        1         0         1\n2          1        1         1         0\n3          0        0         0         0\n4          0        1         0         0\n5          0        1         0         1\n6          0        1         0         0\n7          0        1         0         1\n8          1        0         0         0\n9          0        1         1         1\n</code></pre>"}, {"location": "user_guide/data_management/#multiclass-multioutput", "title": "Multiclass-multioutput", "text": "<p>Multiclass-multioutput (also known as multitask classification) is a classification task which labels each sample with a set of non-binary properties. Both the number of properties and the number of classes per property is greater than 2. A single estimator thus handles several joint classification tasks. 
This is both a generalization of the multilabel classification task, which only considers binary attributes, as well as a generalization of the multiclass classification task, where only one property is considered.</p> <p>For example, classification of the properties \"type of fruit\" and \"colour\" for a set of images of fruit. The property \"type of fruit\" has the possible classes: \"apple\", \"pear\" and \"orange\". The property \"colour\" has the possible classes: \"green\", \"red\", \"yellow\" and \"orange\". Each sample is an image of a fruit, a label is output for both properties and each label is one of the possible classes of the corresponding property.</p>"}, {"location": "user_guide/data_management/#multioutput-regression", "title": "Multioutput regression", "text": "<p>Multioutput regression predicts multiple numerical properties for each sample. Each property is a numerical variable and the number of properties to be predicted for each sample is &gt;= 2. Some estimators that support multioutput regression are faster than just running n_output estimators.</p> <p>For example, prediction of both wind speed and wind direction, in degrees, using data obtained at a certain location. Each sample would be data obtained at one location and both wind speed and direction would be output for each sample.</p>"}, {"location": "user_guide/data_management/#multivariate", "title": "Multivariate", "text": "<p>Multivariate is the multioutput task for forecasting. 
In this case, we try to forecast more than one time series at the same time.</p> <p>Although all forecasting models in ATOM support multivariate tasks, we differentiate two types of models:</p> <ul> <li>The \"native multivariate\" models apply forecasts where every prediction   of endogenous (<code>y</code>) variables will depend on values of the other target   columns.</li> <li>The rest of the models apply an estimator per column, meaning that forecasts   will be made per endogenous variable, and not be affected by other variables.   To access the column-wise estimators, use the estimator's <code>forecasters_</code>   attribute, which stores the fitted forecasters in a dataframe.</li> </ul> <p>Read more about time series tasks here.</p>"}, {"location": "user_guide/data_management/#native-multioutput-models", "title": "Native multioutput models", "text": "<p>Some models have native support for multioutput tasks. This means that the original estimator is used to make predictions directly on all the target columns. Examples of such models are KNearestNeighbors, RandomForest and ExtraTrees.</p>"}, {"location": "user_guide/data_management/#non-native-multioutput-models", "title": "Non-native multioutput models", "text": "<p>The majority of the models don't have integrated support for multioutput tasks. However, it's possible to still use them for such tasks, wrapping them in a meta-estimator capable of handling multiple target columns. For non-native multioutput models, ATOM does so automatically. For multilabel tasks, the meta-estimator is:</p> <ul> <li>ClassifierChain</li> </ul> <p>And for multiclass-multioutput and multioutput regression, the meta-estimators are respectively:</p> <ul> <li>MultioutputClassifier</li> <li>MultioutputRegressor</li> </ul> <p>Warning</p> <p>Currently, scikit-learn metrics do not support multiclass-multioutput classification tasks. 
In this case, ATOM calculates the mean of the selected metric over every individual target.</p> <p>Tip</p> <ul> <li>Set the <code>native_multilabel</code> or <code>native_multioutput</code> parameter in ATOMModel equal to <code>True</code> to ignore the meta-estimator for custom models.</li> <li>Check out the multilabel classification and multioutput regression examples.</li> </ul> <p></p>"}, {"location": "user_guide/data_management/#branches", "title": "Branches", "text": "<p>You might want to compare how a model performs on a dataset transformed through multiple pipelines, each using different transformers. For example, on one pipeline with an undersampling strategy and the other with an oversampling strategy. To be able to do this, ATOM has a branching system.</p> <p>The branching system helps the user to manage multiple data pipelines within the same atom instance. Branches are created and accessed through atom's <code>branch</code> property. A branch contains a specific pipeline, the dataset transformed through that pipeline, and all data and utility attributes that refer to that dataset. Transformers and models called from atom use the dataset in the current branch, as well as data attributes such as <code>atom.dataset</code>. It's not allowed to change the data in a branch after fitting a model with it. Instead, create a new branch for every unique pipeline.</p> <p>By default, atom starts with one branch called \"main\". To start a new branch, set a new name to the property, e.g., <code>atom.branch = \"undersample\"</code>. This creates a new branch from the current one. To create a branch from any other branch type \"_from_\" between the new name and the branch from which to split, e.g., <code>atom.branch = \"oversample_from_main\"</code> creates branch \"oversample\" from branch \"main\", even if the current branch is \"undersample\". 
To switch between existing branches, just type the name of the desired branch, e.g., <code>atom.branch = \"main\"</code> brings you back to the main branch. Note that every branch contains a unique copy of the whole dataset! Creating many branches can cause memory issues for large datasets.</p> <p>See the Imbalanced datasets or Feature engineering examples for branching use cases.</p> <p>Warning</p> <p>Always create a new branch if you want to change the dataset after fitting a model! Forcing a data change through the data property's <code>@setter</code> can cause unexpected model behavior and break down the plotting methods.</p> <p></p> <p> </p> Figure 1. Diagram of a possible branch system to compare an oversampling with an undersampling pipeline. <p></p>"}, {"location": "user_guide/data_management/#memory-considerations", "title": "Memory considerations", "text": "<p>An atom instance stores one copy of the dataset for each branch (this doesn't include the holdout set, which is only stored once), and one copy of the initial dataset with which the instance is initialized. This copy of the original dataset is necessary to avoid data leakage during hyperparameter tuning and for some specific methods like cross_validate and reset. It's created as soon as there are no branches in the initial state (usually after calling the first data transformation). If the dataset is occupying too much memory, consider using the shrink method to convert the dtypes to their smallest possible matching dtype.</p> <p>When working with large datasets and multiple branches, it becomes impossible to store all branches in memory at the same time. To avoid out-of-memory errors, use atom's <code>memory</code> parameter. 
If not <code>False</code>, atom saves the data of inactive branches as well as the original branch at the specified location (in a directory called <code>joblib</code>, the name of the underlying library managing the caching), maintaining only the current active branch in memory. This mechanism results in a slight drop in performance because of the I/O overhead, but can save a lot of memory. Additionally, the memory's location is also used to cache the output of the <code>fit</code> method of transformers in the pipeline. See here an example using the memory parameter.</p> <p>Apart from the dataset itself, a model's metric scores and shap values are also stored as attributes of the model to avoid having to recalculate them every time they are needed. You can delete all these attributes using the clear method in order to free some memory before saving atom.</p> <p></p>"}, {"location": "user_guide/data_management/#data-transformations", "title": "Data transformations", "text": "<p>Performing data transformations is a common requirement of many datasets before they are ready to be ingested by a model. ATOM provides various classes to apply data cleaning and feature engineering transformations to the data. This tooling should be able to help you apply most of the typically needed transformations to get the data ready for modeling. For further fine-tuning, it's also possible to transform the data using custom transformers (see the add method) or through a function (see the apply method). Remember that all transformations are only applied to the dataset in the current branch.</p>"}, {"location": "user_guide/data_management/#row-and-column-selection", "title": "Row and column selection", "text": "<p>Many methods in atom contain the <code>rows</code> or <code>columns</code> parameter to select a subset of the dataset. Examples are the evaluate and save_data methods for <code>rows</code>, and the distribution and shrink methods for <code>columns</code>. 
All data cleaning and feature engineering methods use the <code>columns</code> parameter to apply the transformation only to that selection of columns, and all prediction methods use the <code>rows</code> parameter to make predictions on that selection of rows.</p> <p>As you can see, these two parameters are very important and shared across many methods in atom. Rows and columns can be selected in multiple ways. The check is performed in the order described hereunder:</p> <ol> <li>By actual dataset, e.g., <code>rows=atom.test</code> is equal to <code>rows=\"test\"</code>.</li> <li>By range or slice, e.g., <code>rows=range(100)</code> to select the first 100    rows from the dataset or <code>rows=slice(20, 100)</code> to select rows 20 to 99.</li> <li>By exact name, e.g., <code>rows=[\"row1\", \"row2\"]</code> to select rows with    indices <code>row1</code> and <code>row2</code> or <code>columns=[\"col1\", \"col2\"]</code> to select    columns <code>col1</code> and <code>col2</code>. It's also possible to use the <code>+</code> sign to select    multiple rows or columns, e.g., <code>columns=\"col1+col2\"</code> is the same    as <code>columns=[\"col1\", \"col2\"]</code>.</li> <li>By position, e.g., <code>rows=[0, 1, 2]</code> to select the first three rows.</li> <li>By name of the data set (only for rows), e.g., <code>rows=\"train\"</code> to    select all rows in the training set, or <code>rows=\"test+holdout\"</code> to    select all rows in the test and holdout sets. Valid data sets are <code>dataset</code>,    <code>train</code>, <code>test</code> and <code>holdout</code>.</li> <li>By dtype (only for columns), e.g., <code>columns=\"number\"</code> to select only     numerical columns. See pandas' user guide.</li> <li>By regex match, e.g., <code>columns=\"mean_.*\"</code> to select all columns    starting with <code>mean_</code>.</li> <li>Excluding instead of including using the <code>!</code> sign, e.g. 
<code>columns=\"!col1\"</code>    to select all columns except <code>col1</code>. You can also exclude multiple rows or    columns like this <code>columns=[\"!col1\", \"!col2\"]</code> or this    <code>columns=\"!col1+!col2\"</code>. It's also possible to exclude data sets    for row selection, e.g., <code>rows=\"!train\"</code> or dtypes for column    selection, e.g., <code>columns=\"!number\"</code>. Note that if a column name    starts with <code>!</code>, the selection of that name will take priority over exclusion.    Rows and columns can only be included or excluded, and not both at the same    time. For example, this selection raises an exception <code>columns=[\"col1\", \"!col2\"]</code>.</li> </ol> <p>Info</p> <p>In some plotting methods, it's possible to plot separate lines for different subsets of the rows. For example, to compare the results on the train and test set. For these cases, either provide a sequence to the <code>rows</code> parameter for every line you want to draw, e.g., <code>atom.plot_roc(rows=(\"train\", \"test\"))</code>, or provide a dictionary where the keys are the names of the sets (used in the legend) and the values are the corresponding selection of rows, selected using any of the aforementioned approaches, e.g., <code>atom.plot_roc(rows={\"0-99\": range(100), \"100-199\": range(100, 200)})</code>. Note that for these methods, using <code>atom.plot_roc(rows=\"train+test\")</code>, only plots one line with the data from both sets. See the advanced plotting example.</p>"}, {"location": "user_guide/feature_engineering/", "title": "Feature engineering", "text": "<p>Feature engineering is the process of creating new features from the existing ones, in order to capture relationships with the target column that the first set of features didn't have on their own. This process is very important to improve the performance of machine learning algorithms. 
Although feature engineering works best when the data  scientist applies use-case specific transformations, there are ways to do this in an automated manner, without prior domain knowledge. One of the problems of creating new features without human expert intervention, is that many of the newly created features can be useless, i.e., they do not help the algorithm to make better predictions. Even worse, having useless features can drop your performance. To avoid this, we perform feature selection, a process in which we select the relevant features  in the dataset. See the Feature engineering example.</p> <p>Note</p> <ul> <li>All of atom's feature engineering methods automatically adopt the relevant   transformer attributes (<code>n_jobs</code>, <code>verbose</code>, <code>logger</code>, <code>random_state</code>) from   atom. A different choice can be added as parameter to the method call,   e.g., <code>atom.feature_selection(\"pca\", n_features=10, random_state=2)</code>.</li> <li>Like the add method, the feature engineering   methods accept the <code>columns</code> parameter to only transform a subset of the   dataset's features, e.g., <code>atom.feature_selection(\"pca\",n_features=10, columns=slice(5, 15))</code>. Read more in the   row and column selection section.</li> </ul> <p></p>"}, {"location": "user_guide/feature_engineering/#extracting-datetime-features", "title": "Extracting datetime features", "text": "<p>Features that contain dates or timestamps can not be directly ingested by models since they are not strictly numerical. Encoding them as categorical features is not an option since the encoding does not capture the relationship between the different moments in time. The FeatureExtractor class creates new features extracting datetime elements (e.g., day, month, year, hour...) from the columns. It can be accessed from atom through the feature_extraction method. 
The new features are named equally to the column from which they are extracted, followed by an underscore and the datetime element they create, e.g., <code>x0_day</code> for the day element of <code>x0</code>.</p> <p>Note that many time features have a cyclic pattern, e.g., after Sunday comes Monday. This means that if we would encode the days of the week from 0 to 6, we would lose that relation. A common method used to encode cyclical features is to transform the data into two dimensions using a sine and cosine transformation:</p> \\[ x_{sin} = sin\\left(\\frac{2\\pi * x}{max(x)}\\right) \\] \\[ x_{cos} = cos\\left(\\frac{2\\pi * x}{max(x)}\\right) \\] <p>The resulting features have their names followed by sin or cos, e.g. <code>x0_day_sin</code> and <code>x0_day_cos</code>. The datetime elements that can be encoded in a cyclic fashion are: microsecond, second, minute, hour, weekday, day, day_of_year, month and quarter. Note that decision trees based algorithms build their split rules according to one feature at a time. This means that they will fail to correctly process cyclic features since the sin/cos values are expected to be considered as one single coordinate system.</p> <p>Use the <code>fmt</code> parameter to specify your feature's format in case the column is categorical. The FeatureExtractor class will convert the column to the datetime dtype before extracting the specified features. Click here for an overview of the available formats.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#generating-new-features", "title": "Generating new features", "text": "<p>The FeatureGenerator class creates new non-linear features based on the original feature set. It can be accessed from atom through the feature_generation method. You can choose between two strategies: Deep Feature Synthesis and Genetic Feature Generation.</p> <p>Deep Feature Synthesis Deep feature synthesis (DFS) applies the selected operators on the features in the dataset. 
For example, if the operator is \"log\", it will create the new feature <code>LOG(old_feature)</code> and if the operator is \"mul\", it will create the new feature <code>old_feature_1 x old_feature_2</code>. The operators can be chosen through the <code>operators</code> parameter. Choose from:</p> <ul> <li>add: Take the sum of two features.</li> <li>sub: Subtract two features from each other.</li> <li>mul: Multiply two features with each other.</li> <li>div: Divide two features with each other.</li> <li>abs: Calculate the absolute value of a feature.</li> <li>sqrt: Calculate the square root of a feature.</li> <li>log: Calculate the natural logarithm of a feature.</li> <li>sin: Calculate the sine of a feature.</li> <li>cos: Calculate the cosine of a feature.</li> <li>tan: Calculate the tangent of a feature.</li> </ul> <p>ATOM's implementation of DFS uses the featuretools package.</p> <p></p> <p>Genetic Feature Generation Genetic feature generation (GFG) uses genetic programming, a branch of evolutionary programming, to determine which features are successful and create new ones based on those. Where dfs can be seen as some kind of \"brute force\" for feature engineering, gfg tries to improve its features with every generation of the algorithm. gfg uses the same operators as dfs, but instead of only applying the transformations once, it evolves them further, creating nested structures of combinations of features. The new features are given the name <code>feature_n</code>, where n stands for the n-th feature in the dataset. You can access the genetic feature's fitness and description (how they are calculated) through the <code>genetic_features</code> attribute.</p> <p>ATOM uses the SymbolicTransformer class from the gplearn package for the genetic algorithm. 
Read more about this implementation here.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#grouping-similar-features", "title": "Grouping similar features", "text": "<p>When your dataset contains many similar features corresponding to a certain natural group or entity, it's possible to replace these features for a handful of them, that should capture the relations of the group, in order to lose as little information as possible. To achieve this, the FeatureGrouper class computes certain statistical properties that describe the group's distribution, like the mean or the median, and replaces the columns with the result of these statistical calculations over every row in the dataset. The goal of this approach is to reduce the number of columns in the dataset, avoiding the curse of dimensionality.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#selecting-useful-features", "title": "Selecting useful features", "text": "<p>The FeatureSelector class provides tooling to select the relevant features from a dataset. It can be accessed from atom through the feature_selection method.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#standard-strategies", "title": "Standard strategies", "text": "<p> Univariate Univariate feature selection works by selecting the best features based on univariate statistical F-test. The test is provided via the <code>solver</code> parameter. It takes any function taking two arrays (X, y), and returning arrays (scores, p-values). Read more in sklearn's documentation.</p> <p></p> <p> Principal Components Analysis Applying PCA reduces the dimensionality of the dataset by maximizing the variance of each dimension. The new features are called <code>pca0</code>, <code>pca1</code>, etc... PCA can be applied in three ways:</p> <ul> <li>If the data is dense (i.e., not sparse), the estimator used is PCA.   Before fitting the transformer, the data is scaled to mean=0 and std=1   if it wasn't already. 
Read more in sklearn's documentation.</li> <li>If the data is sparse (often the case for term-document   matrices, see Vectorizer), the estimator used is TruncatedSVD.   Read more in sklearn's documentation.</li> <li>If <code>engine</code> is \"sklearnex\" or \"cuml\", the estimator   used is the package's PCA implementation. Sparse data is not supported for   either engine.</li> </ul> <p></p> <p> Selection from model SFM uses an estimator with <code>feature_importances_</code> or <code>coef_</code> attributes to select the best features in a dataset based on importance weights. The estimator is provided through the <code>solver</code> parameter and can be already fitted. ATOM allows you to use one of its predefined models, e.g., <code>solver=\"RF\"</code>. If you didn't call the FeatureSelector through atom, don't forget to indicate the estimator's task adding <code>_class</code> or <code>_reg</code> after the name, e.g., <code>RF_class</code> to use a random forest classifier. Read more in sklearn's documentation.</p> <p></p> <p> Sequential Feature Selection Sequential feature selection adds (forward selection) or removes (backward selection) features to form a feature subset in a greedy fashion. At each stage, this estimator chooses the best feature to add or remove based on the cross-validation score of an estimator. Read more in sklearn's documentation.</p> <p></p> <p> Recursive Feature Elimination Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features, and the importance of each feature is obtained either through a <code>coef_</code> or through a <code>feature_importances_</code> attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. 
Note that, since RFE needs to fit the model again every iteration, this method can be fairly slow.</p> <p>RFECV applies the same algorithm as RFE but uses a cross-validated metric (under the scoring parameter, see RFECV) to assess every step's performance. Also, where RFE returns the number of features selected by <code>n_features</code>, RFECV returns the number of features that achieved the optimal score on the specified metric. Note that this is not always equal to the amount specified by <code>n_features</code>. Read more in sklearn's documentation.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#advanced-strategies", "title": "Advanced strategies", "text": "<p>The following strategies are a collection of nature-inspired optimization algorithms that maximize an objective function. If not manually specified, the function calculates the cross-validated score of a model on the data. Use the <code>scoring</code> parameter (not present in description, part of kwargs) to specify the metric to optimize on.</p> <p></p> <p> Particle Swarm Optimization Particle Swarm Optimization (PSO) optimizes a problem by having a population of candidate solutions (particles), and moving them around in the search-space according to simple mathematical formula over the particle's position and velocity. Each particle's movement is influenced by its local best known position, but is also guided toward the best known positions in the search-space, which are updated as better positions are found by other particles. This is expected to move the swarm toward the best solutions. Read more here.</p> <p></p> <p> Harris Hawks Optimization Harris Hawks Optimization (HHO) mimics the action and reaction of Hawk's team collaboration hunting in nature and prey escaping to discover the solutions of the single-objective problem. 
Read more here.</p> <p></p> <p> Grey Wolf Optimization The Grey Wolf Optimizer (GWO) mimics the leadership hierarchy and hunting mechanism of grey wolves in nature. Four types of grey wolves such as alpha, beta, delta, and omega are employed for simulating the leadership hierarchy. In addition, three main steps of hunting, searching for prey, encircling prey, and attacking prey, are implemented to perform optimization. Read more here.</p> <p></p> <p> Dragonfly Optimization The Dragonfly Algorithm (DFO) originates from static and dynamic swarming behaviours. These two swarming behaviours are very similar to the two main phases of optimization using meta-heuristics: exploration and exploitation. Dragonflies create sub swarms and fly over different areas in a static swarm, which is the main objective of the exploration phase. In the dynamic swarm, however, dragonflies fly in bigger swarms and along one direction, which is favourable in the exploitation phase. Read more here.</p> <p></p> <p> Genetic Optimization Genetic Optimization is a metaheuristic inspired by the process of natural selection that belongs to the larger class of evolutionary algorithms. Genetic algorithms are commonly used to generate high-quality solutions to optimization and search problems by relying on biologically inspired operators such as mutation, crossover and selection. Read more here.</p> <p></p>"}, {"location": "user_guide/feature_engineering/#other-selection-methods", "title": "Other selection methods", "text": "<p>Removing features with low or high variance Variance is the expectation of the squared deviation of a random variable from its mean. Features with low variance have many values repeated, which means the model can't learn much from them. 
In a similar way, features with very high variance have very few values repeated, which makes it also difficult for a model to learn from this feature.</p> <p>FeatureSelector removes a categorical feature when the maximum number of occurrences for any value is below <code>min_repeated</code> or when the same value is repeated in at least <code>max_repeated</code> fraction of the rows. The default option is to remove a feature if all values in it are either different or exactly the same.</p> <p></p> <p>Removing features with multi-collinearity Two features that are highly correlated are redundant, i.e., two will not contribute more to the model than only one of them. FeatureSelector will drop a feature that has a Pearson correlation coefficient larger than <code>max_correlation</code> with another feature. A correlation of 1 means the two columns are equal. A dataframe of the removed features and their correlation values can be accessed through the <code>collinear</code> attribute.</p>"}, {"location": "user_guide/introduction/", "title": "Introduction", "text": "<p>There is no magic formula in data science that can tell us which type of machine learning estimator in combination with which pipeline will perform best for a given raw dataset. Different models are better suited for different types of data and different types of problems. You can follow some rough guide on how to approach problems with regard to which model to try, but these are incomplete at best.</p> <p>During the exploration phase of a machine learning project, a data scientist tries to find the optimal pipeline for his specific use case. This usually involves applying standard data cleaning steps, creating or selecting useful features, trying out different models, etc. Testing multiple pipelines requires many lines of code, and writing it all in the same notebook often makes it long and cluttered. 
On the other hand, using multiple notebooks makes it harder to compare the results and to keep an overview. On top of that, refactoring the code for every test can be quite time-consuming. How many times have you conducted the same action to pre-process a raw dataset? How many times have you copy-and-pasted code from an old repository to re-use it in a new use case?</p> <p>Although best practices tell us to start with a simple model and build up to more complicated ones, many data scientists just use the model best known to them in order to avoid the aforementioned problems. This can result in poor performance (because the model is just not the right one for the task) or in inefficient management of time and computing resources (because a simpler/faster model could have achieved a similar performance).</p> <p>ATOM is here to help solve these common issues. The package acts as a wrapper of the whole machine learning pipeline, helping the data scientist to rapidly find a good model for his problem. Avoid endless imports and documentation lookups. Avoid rewriting the same code over and over again. With just a few lines of code, it's now possible to perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset, providing quick insights on which pipeline performs best for the task at hand.</p> <p>It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. 
After helping you determine the right pipeline, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance.</p> <p>Example steps taken by ATOM's pipeline:</p> <ol> <li>Data Cleaning<ul> <li>Handle missing values</li> <li>Encode categorical features</li> <li>Detect and remove outliers</li> <li>Balance the training set</li> </ul> </li> <li>Feature engineering<ul> <li>Create new non-linear features</li> <li>Select the most promising features</li> </ul> </li> <li>Train and validate multiple models<ul> <li>Apply hyperparameter tuning</li> <li>Fit the models on the training set</li> <li>Evaluate the results on the test set</li> </ul> </li> <li>Analyze the results<ul> <li>Get the scores on various metrics</li> <li>Make plots to compare the model performances</li> </ul> </li> </ol> <p></p> <p></p> Figure 1. Diagram of a possible pipeline created by ATOM."}, {"location": "user_guide/logging/", "title": "Logging &amp; Tracking", "text": ""}, {"location": "user_guide/logging/#logging", "title": "Logging", "text": "<p>To start logging your experiments, fill the <code>logger</code> parameter with the name or path to store the logging file. If automatic naming is used, the file is saved using the __name__ of the class followed by the timestamp of the logger's creation, e.g. <code>ATOMClassifier_11May21_20h11m03s</code>. The logging file contains method calls, all printed messages to stdout with maximum verbosity, and any exception raised during running. Additionally, the logging entries of external libraries are redirected to the same file handler.</p> <p></p>"}, {"location": "user_guide/logging/#tracking", "title": "Tracking", "text": "<p>ATOM uses MLflow Tracking as a backend API and UI for logging models, parameters, pipelines, data and plots. Start tracking your experiments assigning a name to the <code>experiment</code> parameter. Every model is tracked using a separate run. 
When no backend is configured, the data is stored locally at <code>./mlruns</code>. To configure the backend, use mlflow.set_tracking_uri in your notebook or IDE before initializing atom. This does not affect the currently active run (if one exists), but takes effect for successive runs. Run <code>mlflow ui</code> on your terminal to open MLflow's Tracking UI and view it at http://localhost:5000.</p> <p>Note</p> <p>When using ATOM on Databricks, the experiment's name should include the complete path to the storage, e.g., <code>/Users/username@domain.com/experiment_name</code>.</p> <p>Example</p> <pre><code>from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, experiment=\"breast_cancer\")\natom.run(models=[\"LR\", \"RF\", \"LGB\"], n_trials=(0, 0, 10))\n</code></pre> <p></p> <p></p>"}, {"location": "user_guide/logging/#dagshub-integration", "title": "DAGsHub integration", "text": "<p>ATOM has a built-in integration with DAGsHub, a web platform based on open source tools, optimized for data science and oriented towards the open source community. To store your mlflow experiments in a DAGsHub repo, type <code>dagshub:&lt;experiment_name&gt;</code> in the <code>experiment</code> parameter (instead of just the experiment's name). If the repo does not already exist, a new public repo is created.</p> <p>Info</p> <p>If you are logged into your DAGsHub account when initializing atom with a dagshub experiment, a page on your web browser is automatically opened to give access permissions. 
If not, read here how to set up your DAGsHub credentials.</p> <p>Example</p> <pre><code>from atom import ATOMClassifier\nfrom sklearn.datasets import load_breast_cancer\n\nX, y = load_breast_cancer(return_X_y=True, as_frame=True)\n\natom = ATOMClassifier(X, y, experiment=\"dagshub:breast_cancer\")\natom.run(models=[\"LR\", \"RF\"])\n</code></pre> <p></p> <p></p>"}, {"location": "user_guide/logging/#tracked-elements", "title": "Tracked elements", "text": "<p>Tags The runs are automatically tagged with the model's full name, the branch from which the model was trained, and the time it took to fit the model. Add additional custom tags through the <code>ht_params</code> parameter, e.g.,  <code>atom.run([\"LR\", \"RF\"], ht_params={\"tags\": {\"tag1\": 1}})</code>.</p> <p>Parameters All parameters used by the estimator at initialization are tracked. Additional parameters passed to the fit method are not tracked.</p> <p>Model The model's estimator is stored as an artifact. The estimator has to be compatible with the mlflow.sklearn module.</p> <p>Hyperparameter tuning If hyperparameter tuning is performed, every trial is tracked as a nested run in the model's main run. This option can be switched off using atom's <code>log_ht</code> attribute, e.g., <code>atom.log_ht = False</code>. The data and pipeline options are never stored within nested runs.</p> <p>Metrics All metric results are tracked, not only during training, but also when the evaluate method is called at a later point. Metrics calculated during in-training validation are also stored.</p> <p>Dataset The train and test sets used to fit and evaluate the model can be stored as <code>.csv</code> files to the run's artifacts. This option can be switched on using atom's <code>log_data</code> attribute, e.g. <code>atom.log_data = True</code>.</p> <p>Pipeline The model's pipeline (returned from the export_pipeline method) can be stored as an artifact. 
This option can be switched on using atom's <code>log_pipeline</code> attribute, e.g., <code>atom.log_pipeline = True</code>.</p> <p>Plots By default, plots are stored as <code>.html</code> artifacts in all runs corresponding to the models that are showed in the plot. If the <code>filename</code> parameter is specified, they are stored under that name, else the method's name is used. This option can be switched off using atom's <code>log_plots</code> attribute, e.g., <code>atom.log_plots = False</code>.</p>"}, {"location": "user_guide/models/", "title": "Models", "text": ""}, {"location": "user_guide/models/#predefined-models", "title": "Predefined models", "text": "<p>ATOM provides many models for classification and regression tasks that can be used to fit the data in the pipeline. After fitting, a class containing the underlying estimator is attached to atom as an attribute. We refer to these \"subclasses\" as models. Apart from the estimator, the models contain a variety of attributes and methods that can help you understand how the underlying estimator performed. They can be accessed using their acronyms, e.g., <code>atom.LGB</code> to access the LightGBM model. 
The available models and their corresponding acronyms are:</p> <ul> <li>AdaBoost (AdaB)</li> <li>ARIMA (Arima)</li> <li>AutoARIMA (AutoARIMA)</li> <li>AutomaticRelevanceDetermination (ARD)</li> <li>Bagging (Bag)</li> <li>BayesianRidge (BR)</li> <li>BernoulliNB (BNB)</li> <li>CatBoost (CatB)</li> <li>CategoricalNB (CatNB)</li> <li>ComplementNB (CNB)</li> <li>DecisionTree (Tree)</li> <li>Dummy (Dummy)</li> <li>ElasticNet (EN)</li> <li>ETS (ETS)</li> <li>ExponentialSmoothing (ES)</li> <li>ExtraTree (ETree)</li> <li>ExtraTrees (ET)</li> <li>GaussianNB (GNB)</li> <li>GaussianProcess (GP)</li> <li>GradientBoostingMachine (GBM)</li> <li>HuberRegression (Huber)</li> <li>HistGradientBoosting (hGBM)</li> <li>KNearestNeighbors (KNN)</li> <li>Lasso (Lasso)</li> <li>LeastAngleRegression (Lars)</li> <li>LightGBM (LGB)</li> <li>LinearDiscriminantAnalysis (LDA)</li> <li>LinearSVM (lSVM)</li> <li>LogisticRegression (LR)</li> <li>MultiLayerPerceptron (MLP)</li> <li>MultinomialNB (MNB)</li> <li>NaiveForecaster (NF)</li> <li>OrdinaryLeastSquares (OLS)</li> <li>OrthogonalMatchingPursuit (OMP)</li> <li>PassiveAggressive (PA)</li> <li>Perceptron (Perc)</li> <li>PolynomialTrend (PT)</li> <li>QuadraticDiscriminantAnalysis (QDA)</li> <li>RadiusNearestNeighbors (RNN)</li> <li>RandomForest (RF)</li> <li>Ridge (Ridge)</li> <li>StochasticGradientDescent (SGD)</li> <li>SupportVectorMachine (SVM)</li> <li>XGBoost (XGB)</li> </ul> <p>Warning</p> <p>The model classes can not be initialized directly by the user! Use them only through atom.</p> <p>Tip</p> <p>The acronyms are case-insensitive, e.g., <code>atom.lgb</code> also calls the LightGBM model.</p> <p></p>"}, {"location": "user_guide/models/#custom-models", "title": "Custom models", "text": "<p>It is also possible to create your own models in ATOM's pipeline. For example, imagine we want to use sklearn's RANSACRegressor estimator (note that is not included in ATOM's predefined models). 
There are two ways to achieve this:</p> <ul> <li>Using ATOMModel (recommended). With this approach you can pass   the required model characteristics to the pipeline.</li> </ul> <pre><code>&gt;&gt;&gt; from atom import ATOMRegressor, ATOMModel\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n&gt;&gt;&gt; from sklearn.linear_model import RANSACRegressor\n\n&gt;&gt;&gt; ransac = ATOMModel(RANSACRegressor, name=\"RANSAC\", needs_scaling=True)\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y)\n&gt;&gt;&gt; atom.run(ransac)\n</code></pre> <ul> <li>Using the estimator's class or an instance of the class. This approach   will also call ATOMModel under the hood, but it will leave its   parameters to their default values.</li> </ul> <pre><code>&gt;&gt;&gt; from atom import ATOMRegressor\n&gt;&gt;&gt; from sklearn.datasets import load_diabetes\n&gt;&gt;&gt; from sklearn.linear_model import RANSACRegressor\n\n&gt;&gt;&gt; X, y = load_diabetes(return_X_y=True, as_frame=True)\n\n&gt;&gt;&gt; atom = ATOMRegressor(X, y)\n&gt;&gt;&gt; atom.run(RANSACRegressor)\n</code></pre> <p>Additional things to take into account:</p> <ul> <li>Custom models can be accessed through their acronym like any other model, e.g.   
<code>atom.ransac</code> in the example above.</li> <li>Custom models are not restricted to sklearn estimators, but they should   follow sklearn's API, i.e., have a fit and predict method.</li> <li>Parameter customization (for the initializer) is only possible for   custom models which provide an estimator that has a <code>set_params()</code> method,   i.e., it's a child class of BaseEstimator.</li> <li>Hyperparameter tuning for custom models is ignored unless appropriate   dimensions are provided through <code>ht_params</code>.</li> </ul> <p></p>"}, {"location": "user_guide/models/#deep-learning", "title": "Deep learning", "text": "<p>Deep learning models can be used through ATOM's custom models as long as they follow sklearn's API. For example, models implemented with the Keras package should use the scikeras wrappers KerasClassifier or KerasRegressor.</p> <p>Many deep learning use cases, for example in computer vision, use datasets with more than 2 dimensions, e.g., image data can have shape (n_samples, length, width, rgb). Luckily, scikeras has a workaround to be able to work with such datasets. Learn with this example how to use ATOM to train and validate a Convolutional Neural Network on an image dataset.</p> <p>Warning</p> <p>Models implemented with keras can only use custom hyperparameter tuning when <code>n_jobs=1</code> or <code>ht_params={\"cv\": 1}</code>. Using n_jobs &gt; 1 and cv &gt; 1 raises a PicklingError due to incompatibilities of the APIs.</p> <p></p>"}, {"location": "user_guide/models/#ensembles", "title": "Ensembles", "text": "<p>Ensemble models use multiple estimators to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone. ATOM implements two ensemble techniques: voting and stacking. 
Click here to see an example that uses ensemble models.</p> <p>If the ensemble's underlying estimator is a model that used automated feature scaling, it's added as a Pipeline containing the <code>scaler</code> and estimator. If a mlflow experiment is active, the ensembles start their own run, just like the predefined models do.</p> <p>Warning</p> <p>Combining models trained on different branches into one ensemble is not allowed and will raise an exception.</p>"}, {"location": "user_guide/models/#voting", "title": "Voting", "text": "<p>The idea behind voting is to combine the predictions of conceptually different models to make new predictions. Such a technique can be useful for a set of equally well performing models in order to balance out their individual weaknesses. Read more in sklearn's documentation.</p> <p>A voting model is created from a trainer through the voting method. The voting model is added automatically to the list of models in the trainer, under the <code>Vote</code> acronym. The underlying estimator is a custom adaptation of VotingClassifier or VotingRegressor depending on the task. The differences between ATOM's and sklearn's implementation are:</p> <ul> <li>ATOM's implementation doesn't fit estimators if they're already fitted.</li> <li>ATOM's instance is considered fitted at initialization when all underlying   estimators are.</li> <li>ATOM's VotingClassifier doesn't implement a LabelEncoder to encode the   target column.</li> </ul> <p>The two estimators are customized in this way to save time and computational resources, since the classes are always initialized with fitted estimators. As a consequence of this, the VotingClassifier can not use sklearn's built-in LabelEncoder for the target column since it can't be fitted when initializing the class. For the vast majority of use cases, the changes will have no effect. 
If you want to export the estimator and retrain it on different data, just make sure to clone the underlying estimators first.</p> <p></p>"}, {"location": "user_guide/models/#stacking", "title": "Stacking", "text": "<p>Stacking is a method for combining estimators to reduce their biases. More precisely, the predictions of each individual estimator are stacked together and used as input to a final estimator to compute the prediction. Read more in sklearn's documentation.</p> <p>A stacking model is created from a trainer through the stacking method. The stacking model is added automatically to the list of models in the trainer, under the <code>Stack</code> acronym. The underlying estimator is a custom adaptation of StackingClassifier or StackingRegressor depending on the task. The only difference between ATOM's and sklearn's implementation is that ATOM's implementation doesn't fit estimators if they're already fitted. The two estimators are customized in this way to save time and computational resources, since the classes are always initialized with fitted estimators. For the vast majority of use cases, the changes will have no effect. If you want to export the estimator and retrain it on different data, just make sure to clone the underlying estimators first.</p>"}, {"location": "user_guide/nlp/", "title": "Natural Language Processing", "text": "<p>Natural Language Processing (NLP) is the subfield of machine learning that works with human language data. The nlp module contains four classes that help to convert raw text to meaningful numeric values, ready to be ingested by a model. ATOM uses the nltk library for the majority of its NLP processes.</p> <p>The text documents are expected to be provided in a column of the dataframe named <code>corpus</code> (the name is case-insensitive). Only the corpus is changed by the transformers, leaving the rest of the columns as is. 
This mechanism allows atom to combine datasets containing a text corpus with other non-text features. If an array is provided as input, it should consist of only one feature containing the text (one document per row). ATOM will then automatically convert the array to a dataframe with the desired column name. Documents are expected to be strings or sequences of words. Click here for an example using text data.</p> <p>Note</p> <p>All of atom's NLP methods automatically adopt the relevant transformer attributes (<code>verbose</code>, <code>logger</code>) from atom. A different choice can be added as parameter to the method call, e.g., <code>atom.tokenize(verbose=0)</code>.</p> <p>Info</p> <p>ATOM doesn't do topic modeling! The module's goal is to help process text documents into features that can be used for supervised learning.</p> <p></p>"}, {"location": "user_guide/nlp/#text-cleaning", "title": "Text cleaning", "text": "<p>Text data is rarely clean. Whether it's scraped from a website or inferred from paper documents, it's always populated with irrelevant information for the model, such as email addresses, HTML tags, numbers or punctuation marks. Use the TextCleaner class to clean the corpus from such noise. It can be accessed from atom through the textclean method. Use the class' parameters to choose which transformations to perform. 
The available steps are:</p> <ul> <li>Decode unicode characters to their ascii representations.</li> <li>Convert all characters to lower case.</li> <li>Drop email addresses from the text.</li> <li>Drop URL links from the text.</li> <li>Drop HTML tags from the text.</li> <li>Drop emojis from the text.</li> <li>Drop numbers from the text.</li> <li>Drop punctuations from the text.</li> </ul> <p></p>"}, {"location": "user_guide/nlp/#tokenization", "title": "Tokenization", "text": "<p>Some text processing algorithms, like stemming or lemmatization, require the corpus to be made out of tokens, instead of strings, in order to know what to consider as words. Tokenization is used to achieve this. It separates every document into a sequence of smaller units. In this case, the words.</p> <p>Sometimes, words have a different meaning on their own than when combined with adjacent words. For example, the word <code>new</code> has a completely different meaning when the word <code>york</code> is directly after it than when it's not. These combinations of two words are called bigrams. When there are three words, they are called trigrams, and with four words quadgrams.</p> <p>The Tokenizer class converts a document into a sequence of words, and can create the most frequent bigrams, trigrams and quadgrams. It can be accessed from atom through the tokenize method.</p> <p></p>"}, {"location": "user_guide/nlp/#text-normalization", "title": "Text Normalization", "text": "<p>Normalization for texts is a process that converts a list of words to a more uniform standard. This is useful to reduce the amount of different information that the computer has to deal with, and therefore improves efficiency. The goal of normalization techniques like stemming and lemmatization is to reduce inflectional and related forms of a word to a common base form.</p> <p>Normalize the words in the corpus using the TextNormalizer class. 
It can be accessed from atom through the textnormalize method.</p> <p></p>"}, {"location": "user_guide/nlp/#vectorization", "title": "Vectorization", "text": "<p>Text data cannot be fed directly to the algorithms themselves, as most of them expect numerical feature vectors with a fixed size, rather than words in the text documents with variable length. Vectorization is the general process of turning a collection of text documents into numerical feature vectors. You can apply it to the corpus using the Vectorizer class. It can be accessed from atom through the vectorize method.</p> <p>Info</p> <p>All strategies can utilize GPU speed-up. Click here for further information about GPU acceleration.</p> <p></p> <p>Bag of Words The Bag of Words (BOW) strategy applies tokenization, counting and normalization to the corpus. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document. The created columns are named with the words they are embedding with the prefix <code>corpus_</code>. Read more in sklearn's documentation.</p> <p></p> <p>TF-IDF In a large text corpus, some words will be very present (e.g., \u201cthe\u201d, \u201ca\u201d, \u201cis\u201d in English), hence carrying very little meaningful information about the actual contents of the document. If we were to feed the direct count data directly to a classifier, those very frequent terms would shadow the frequencies of rarer, yet more interesting, terms. Use the TF-IDF strategy to re-weight the count features into floating point values. The created columns are named with the words they are embedding with the prefix <code>corpus_</code>. Read more in sklearn's documentation.</p> <p></p> <p>Hashing The larger the corpus, the larger the vocabulary will grow and thus increasing the number of features and memory use. Use the Hashing strategy to hash the words to a specified number of features. 
The created features are named <code>hash0</code>, <code>hash1</code>, etc... Read more in sklearn's documentation.</p>"}, {"location": "user_guide/nomenclature/", "title": "Nomenclature", "text": "<p>This documentation consistently uses terms to refer to certain concepts related to this package. The most frequent terms are described hereunder.</p> <p></p> ATOM <p>Refers to this package.</p> atom <p>Instance of the ATOMClassifier, ATOMForecaster or ATOMRegressor classes (note that the examples use it as the default variable name).</p> <p>A pipeline, corresponding dataset and models fitted to that dataset. See the branches section of the user guide.</p> categorical columns <p>Refers to all columns of type <code>object</code> or <code>category</code>.</p> class <p>Unique value in a column, e.g., a binary classifier has 2 classes in the target column.</p> dataframe <p>Two-dimensional, size-mutable, potentially heterogeneous tabular data of type pd.DataFrame or its modin counterpart.</p> dataframe-like <p>Any type object from which a dataframe can be created. This includes an iterable, a dict whose values are 1d-arrays, a two-dimensional list, tuple, np.ndarray or sps.csr_matrix, and most commonly, a dataframe. This is the standard input format for any dataset.</p> <p>Additionally, you can provide a callable whose output is any of the aforementioned types. This is useful when the dataset is very large and you are performing parallel operations, since it can avoid broadcasting a large dataset from the driver to the workers.</p> estimator <p>An object which manages the estimation and decoding of an algorithm. The algorithm is estimated as a deterministic function of a set of parameters, a dataset and a random state. Should implement a <code>fit</code> method. 
Often used interchangeably with predictor because of user preference.</p> index <p>Immutable sequence used for indexing and alignment of type pd.Index or their modin counterpart.</p> missing values <p>All values in the <code>missing</code> attribute, as well as <code>None</code>, <code>NaN</code>, <code>+inf</code> and <code>-inf</code>.</p> model <p>Instance of a model in atom. Not to confuse with estimator.</p> outliers <p>Sample that contains one or more outlier values. Note that the Pruner class can use a different definition for outliers depending on the chosen strategy.</p> outlier value <p>Value that lies further than 3 times the standard deviation away from the mean of its column, i.e., |z-score| &gt; 3.</p> predictor <p>An estimator implementing a <code>predict</code> method.</p> scorer <p>A non-estimator callable object which evaluates an estimator on given test data, returning a number. Unlike evaluation metrics, a greater returned number must correspond with a better score. See sklearn's documentation.</p> segment <p>Subset (segment) of a sequence, whether through slicing or generating a range of values. When given as a parameter type, it includes both range and slice.</p> sequence <p>A one-dimensional, indexable array of type sequence (except string), np.ndarray, index or series. This is the standard input format for a dataset's target column.</p> series <p>One-dimensional ndarray with axis labels of type pd.Series or its modin counterpart.</p> target <p>The dependent variable in a supervised learning task. 
Passed as <code>y</code> to an estimator's fit method.</p> task <p>One of the supervised machine learning approaches that ATOM supports:</p> <ul> <li>binary classification</li> <li>multiclass classification</li> <li>multilabel classification</li> <li>multiclass-multioutput classification</li> <li>regression</li> <li>multioutput regression</li> <li>univariate forecast</li> <li>multivariate forecast</li> </ul> transformer <p>An estimator implementing a <code>transform</code> method. This encompasses all data cleaning and feature engineering classes.</p>"}, {"location": "user_guide/plots/", "title": "Plots", "text": "<p>ATOM provides many plotting methods to analyze the data or compare the model performances. Descriptions and examples can be found in the API section. ATOM mainly uses the plotly library for plotting. Plotly makes interactive, publication-quality graphs that are rendered using html. Some plots require other libraries like matplotlib, shap, wordcloud and schemdraw.</p> <p>Plots that compare model performances (methods with the <code>models</code> parameter) can be called directly from atom, e.g., <code>atom.plot_roc()</code>, or from one of the models, e.g., <code>atom.adab.plot_roc()</code>. If called from atom, use the <code>models</code> parameter to specify which models to plot. If called from a specific model, it makes the plot only for that model and the <code>models</code> parameter becomes unavailable.</p> <p>Plots that analyze the data (methods without the <code>models</code> parameter) can only be called from atom, and not from the models.</p> <p></p>"}, {"location": "user_guide/plots/#parameters", "title": "Parameters", "text": "<p>Apart from the plot-specific parameters, all plots have five parameters in common:</p> <ul> <li>The <code>title</code> parameter adds a title to the plot. The default value doesn't   show any title. 
Provide a configuration (as dictionary) to customize its   appearance, e.g., <code>title=dict(text=\"Awesome plot\", color=\"red\")</code>.   Read more in plotly's documentation.</li> <li> <p>The <code>legend</code> parameter is used to show/hide, position or customize the   plot's legend. Provide a configuration (as dictionary) to customize its   appearance (e.g., <code>legend=dict(title=\"Title for legend\", title_font_color=\"red\")</code>)   or choose one of the following locations:</p> <ul> <li>upper left</li> <li>upper right</li> <li>lower left</li> <li>lower right</li> <li>upper center</li> <li>lower center</li> <li>center left</li> <li>center right</li> <li>center</li> <li>out: Position the legend outside the axis, on the right hand side. This   is plotly's default position. Note that this shrinks the size of the axis   to fit both legend and axes in the specified <code>figsize</code>.</li> </ul> </li> <li> <p>The <code>figsize</code> parameter adjusts the plot's size.</p> </li> <li>The <code>filename</code> parameter is used to save the plot.</li> <li>The <code>display</code> parameter determines whether to show or return the plot.</li> </ul> <p>Info</p> <p>In some plotting methods, it's possible to plot separate lines for different subsets of the rows. For example, to compare the results on the train and test set. For these cases, either provide a sequence to the <code>rows</code> parameter for every line you want to draw, e.g., <code>atom.plot_roc(rows=(\"train\", \"test\"))</code>, or provide a dictionary where the keys are the names of the sets (used in the legend) and the values are the corresponding selection of rows, selected using any of the aforementioned approaches, e.g., <code>atom.plot_roc(rows={\"0-99\": range(100), \"100-199\": range(100, 200)})</code>. Note that for these methods, using <code>atom.plot_roc(rows=\"train+test\")</code>, only plots one line with the data from both sets. 
See the advanced plotting example.</p> <p></p>"}, {"location": "user_guide/plots/#aesthetics", "title": "Aesthetics", "text": "<p>The plot's aesthetics can be customized using the plot attributes prior to calling the plotting method, e.g., <code>atom.title_fontsize = 30</code>. The default values are:</p> <ul> <li>palette: [\"rgb(0, 98, 98)\", \"rgb(56, 166, 165)\", \"rgb(115, 175, 72)\",   \"rgb(237, 173, 8)\", \"rgb(225, 124, 5)\", \"rgb(204, 80, 62)\", \"rgb(148, 52, 110)\",   \"rgb(111, 64, 112)\", \"rgb(102, 102, 102)\"]</li> <li>title_fontsize: 24</li> <li>label_fontsize: 16</li> <li>tick_fontsize: 12</li> </ul> <p>Use atom's update_layout method to further customize the plot's layout using any of plotly's layout properties, e.g., <code>atom.update_layout(template=\"plotly_dark\")</code>. Similarly, use the update_traces method to customize the traces properties, e.g. <code>atom.update_traces(mode=\"lines+markers\")</code>.</p> <p>The reset_aesthetics method allows you to reset all aesthetics to their default value. See advanced plotting for an example.</p> <p></p>"}, {"location": "user_guide/plots/#canvas", "title": "Canvas", "text": "<p>Use the canvas method to draw multiple plots side by side, for example to make it easier to compare similar results. The canvas method is a <code>@contextmanager</code>, i.e., it's used through Python's <code>with</code> command. Plots in a canvas ignore the legend, figsize, filename and display parameters. Instead, specify these parameters in the canvas. If a variable is assigned to the canvas (e.g., <code>with atom.canvas() as fig</code>), it yields the resulting figure.</p> <p>For example, we can use a canvas to compare the results of a XGBoost and LightGBM model on the train and test set. We could also draw the lines for both models in the same axes, but that would clutter the plot too much. 
Click here for more examples.</p> <pre><code>&gt;&gt;&gt; from atom import ATOMClassifier\n&gt;&gt;&gt; from sklearn.datasets import make_classification\n\n&gt;&gt;&gt; X, y = make_classification(n_samples=1000, flip_y=0.2, random_state=1)\n\n&gt;&gt;&gt; atom = ATOMClassifier(X, y)\n&gt;&gt;&gt; atom.run([\"XGB\", \"LGB\"])\n\n&gt;&gt;&gt; with atom.canvas(2, 2, title=\"XGBoost vs LightGBM\"):\n...     atom.xgb.plot_roc(rows=\"train+test\", title=\"ROC - XGBoost\")\n...     atom.lgb.plot_roc(rows=\"train+test\", title=\"ROC - LightGBM\")\n...     atom.xgb.plot_prc(rows=\"train+test\", title=\"PRC - XGBoost\")\n...     atom.lgb.plot_prc(rows=\"train+test\", title=\"PRC - LightGBM\")\n</code></pre> <p></p>"}, {"location": "user_guide/plots/#shap", "title": "SHAP", "text": "<p>The SHAP (SHapley Additive exPlanations) python package uses a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. ATOM implements methods to plot 7 of SHAP's plotting functions directly from its API. A list of available shap plots can be found here.</p> <p>Calculating the Shapley values is computationally expensive, especially for model agnostic explainers like Permutation. To avoid having to recalculate the values for every plot, ATOM stores the shapley values internally after the first calculation, and accesses them later when needed again.</p> <p>Note</p> <p>Since the plot figures are not made by ATOM, note the following:</p> <ul> <li>It's not possible to draw multiple models in the same figure.   Selecting more than one model will raise an exception. 
To avoid   this, call the plot directly from a model, e.g., <code>atom.lr.plot_shap_force()</code>.</li> <li>The returned plot is a matplotlib figure, not plotly's.</li> </ul> <p></p>"}, {"location": "user_guide/plots/#available-plots", "title": "Available plots", "text": "<p>A list of available plots can be found hereunder. Note that not all plots can be called from every class and that their availability can depend on the task at hand.</p>"}, {"location": "user_guide/plots/#data-plots", "title": "Data plots", "text": "<p>plot_componentsPlot the explained variance ratio per component.plot_correlationPlot a correlation matrix.plot_distributionPlot column distributions.plot_ngramsPlot n-gram frequencies.plot_pcaPlot the explained variance ratio vs number of components.plot_qqPlot a quantile-quantile plot.plot_relationshipsPlot pairwise relationships in a dataset.plot_rfecvPlot the rfecv results.plot_wordcloudPlot a wordcloud from the corpus.</p>"}, {"location": "user_guide/plots/#hyperparameter-tuning-plots", "title": "Hyperparameter tuning plots", "text": "<p>plot_edfPlot the Empirical Distribution Function of a study.plot_hyperparameter_importancePlot a model's hyperparameter importance.plot_hyperparametersPlot hyperparameter relationships in a study.plot_parallel_coordinatePlot high-dimensional parameter relationships in a study.plot_pareto_frontPlot the Pareto front of a study.plot_slicePlot the parameter relationship in a study.plot_terminator_improvementPlot the potentials for future objective improvement.plot_timelinePlot the timeline of a study.plot_trialsPlot the hyperparameter tuning trials.</p>"}, {"location": "user_guide/plots/#prediction-plots", "title": "Prediction plots", "text": "<p>plot_calibrationPlot the calibration curve for a binary classifier.plot_confusion_matrixPlot a model's confusion matrix.plot_detPlot the Detection Error Tradeoff curve.plot_errorsPlot a model's prediction errors.plot_evalsPlot evaluation curves.plot_feature_importancePlot 
a model's feature importance.plot_forecastPlot a time series with model forecasts.plot_gainsPlot the cumulative gains curve.plot_learning_curvePlot the learning curve: score vs number of training samples.plot_liftPlot the lift curve.plot_parshapPlot the partial correlation of shap values.plot_partial_dependencePlot the partial dependence of features.plot_permutation_importancePlot the feature permutation importance of models.plot_pipelinePlot a diagram of the pipeline.plot_prcPlot the precision-recall curve.plot_probabilitiesPlot the probability distribution of the target classes.plot_residualsPlot a model's residuals.plot_resultsPlot the model results.plot_rocPlot the Receiver Operating Characteristics curve.plot_successive_halvingPlot scores per iteration of the successive halving.plot_thresholdPlot metric performances against threshold values.</p>"}, {"location": "user_guide/plots/#shap-plots", "title": "Shap plots", "text": "<p>plot_shap_barPlot SHAP's bar plot.plot_shap_beeswarmPlot SHAP's beeswarm plot.plot_shap_decisionPlot SHAP's decision plot.plot_shap_forcePlot SHAP's force plot.plot_shap_heatmapPlot SHAP's heatmap plot.plot_shap_scatterPlot SHAP's scatter plot.plot_shap_waterfallPlot SHAP's waterfall plot.</p>"}, {"location": "user_guide/predicting/", "title": "Predicting", "text": "<p>After training a model, you probably want to make predictions on new, unseen data. Just like a sklearn estimator, you can call the prediction methods from the model, e.g., <code>atom.tree.predict(X)</code>.</p> <p>All prediction methods transform the provided data through the pipeline in the model's branch before making the predictions. 
Transformers that should only be applied on the training set are excluded from this step (e.g., outlier pruning or class balancing).</p> <p>The available prediction methods are the standard methods for estimators in sklearn's and sktime's API.</p> <p>For classification and regression tasks:</p> <p>decision_functionGet confidence scores on new data or existing rows.predictGet predictions on new data or existing rows.predict_log_probaGet class log-probabilities on new data or existing rows.predict_probaGet class probabilities on new data or existing rows.scoreGet a metric score on new data.</p> <p>For forecast tasks:</p> <p>predictGet predictions on new data or existing rows.predict_intervalGet prediction intervals on new data or existing rows.predict_probaGet probabilistic forecasts on new data or existing rows.predict_quantilesGet probabilistic forecasts on new data or existing rows.predict_varGet probabilistic forecasts on new data or existing rows.scoreGet a metric score on new data.</p> <p>Warning</p> <p>The <code>score</code> method returns atom's metric score, not the metric returned by sklearn/sktime's score method for estimators. Use the method's <code>metric</code> parameter to calculate a different metric.</p> <p>Note</p> <ul> <li>The output of ATOM's methods are pandas objects, not numpy arrays.</li> <li>The <code>predict_proba</code> method of some meta-estimators for multioutput tasks   (such as MultioutputClassifier) return 3 dimensions, namely, a list of   arrays with shape=(n_samples, n_classes). One array per target column. Since   ATOM's prediction methods return pandas objects, such 3-dimensional arrays   are converted to a multiindex pd.DataFrame, where the first level of the row   indices are the target columns, and the second level are the classes.</li> <li>The prediction results are cached after the first call to avoid subsequent   expensive calculations. This mechanism can increase the size of the instance   for large datasets. 
Use the clear method to free the   memory.</li> </ul> <p>It's also possible to get the prediction for a specific row or rows in the dataset. See the row and column selection section in the user guide to learn how to select the rows, e.g., <code>atom.rf.predict(\"test\")</code> or <code>atom.rf.predict_proba(range(100))</code>.</p> <p>Note</p> <p>For forecast models, prediction on rows follow the ForecastingHorizon API. That means that using the row index works, but for example using <code>atom.arima.predict(1)</code> returns the prediction on the first row of the test set (instead of the second row of the train set).</p>"}, {"location": "user_guide/time_series/", "title": "Time series", "text": ""}, {"location": "user_guide/time_series/#forecast", "title": "Forecast", "text": ""}, {"location": "user_guide/time_series/#time-series-classification", "title": "Time series classification", "text": ""}, {"location": "user_guide/time_series/#time-series-regression", "title": "Time series regression", "text": ""}, {"location": "user_guide/training/", "title": "Training", "text": "<p>The training phase is where the models are fitted on the training data. After this, you can use the plots and prediction methods to evaluate the results. The training applies the following steps for all models:</p> <ol> <li>Use hyperparameter tuning to select the optimal hyperparameters for     the model (optional).</li> <li>The model is fitted on the training set using the best combination    of hyperparameters found. 
After that, the model is evaluated on the test set.</li> <li>Calculate various scores on the test set using a bootstrap    algorithm (optional).</li> </ol> <p>There are three approaches to run the training.</p> <ul> <li>Direct training:<ul> <li>DirectClassifier</li> <li>DirectForecaster</li> <li>DirectRegressor</li> </ul> </li> <li>Training via successive halving:<ul> <li>SuccessiveHalvingClassifier</li> <li>SuccessiveHalvingForecaster</li> <li>SuccessiveHalvingRegressor</li> </ul> </li> <li>Training via train sizing:<ul> <li>TrainSizingClassifier</li> <li>TrainSizingForecaster</li> <li>TrainSizingRegressor</li> </ul> </li> </ul> <p>The direct fashion repeats the aforementioned steps only once, while the other two approaches repeat them more than once. Just like the data cleaning and feature engineering classes, it's discouraged to use these classes directly. Instead, every approach can be called directly from atom through the run, successive_halving and train_sizing methods respectively.</p> <p>Models are called through their acronyms, e.g., <code>atom.run(models=\"RF\")</code> will train a RandomForest. If you want to run the same model multiple times, add a tag after the acronym to differentiate them. The tag must be separated from the acronym by an underscore.</p> <pre><code>atom.run(\n    models=[\"RF_1\", \"RF_2\"],\n    est_params={\n        \"RF_1\": {\"n_estimators\": 100},\n        \"RF_2\": {\"n_estimators\": 200},\n    }\n)\n</code></pre> <p>For example, this pipeline fits two Random Forest models, one with 100 and the other with 200 decision trees. The models can be accessed through <code>atom.rf_1</code> and <code>atom.rf_2</code>. Use tagged models to test how the same model performs when fitted with different parameters or on different data sets. 
See the Imbalanced datasets example.</p> <p>Additional things to take into account:</p> <ul> <li>If an exception is encountered while fitting an estimator, the   pipeline will automatically jump to the next model. The exceptions are   stored in the <code>errors</code> attribute. Note that when a model is skipped,   there is no model subclass for that estimator.</li> <li>When showing the final results, a <code>!</code> indicates the highest score   and a <code>~</code> indicates that the model is possibly overfitting (training   set has a score at least 20% higher than the test set).</li> </ul> <p></p>"}, {"location": "user_guide/training/#metric", "title": "Metric", "text": "<p>ATOM uses sklearn's scorers for model evaluation. A scorer consists of a metric function and some parameters that define the scorer's properties , such as if a higher or lower score is better (score or loss function) or if the function needs probability estimates or rounded predictions (see the make_scorer function). The <code>metric</code> parameter accepts three ways of defining the scorer:</p> <ul> <li>Using the name of one of the predefined scorers.</li> <li>Using a function with signature <code>function(y_true, y_pred) -&gt; score</code>.   In this case, ATOM uses make_scorer   with default parameters.</li> <li>Using a scorer object.</li> </ul> <p>Note that all scorers follow the convention that higher return values are better than lower return values. Thus, metrics which measure the distance between the model and the data (i.e., loss functions), like <code>max_error</code> or <code>mean_squared_error</code>, will return the negated value of the metric.</p> <p></p>"}, {"location": "user_guide/training/#predefined-scorers", "title": "Predefined scorers", "text": "<p>ATOM accepts all sklearn's scorers as well as some custom acronyms and custom scorers. 
Since some of sklearn's scorers have quite long names and ATOM is all about lazyfast experimentation, the package provides acronyms for some of the most commonly used ones. These acronyms are case-insensitive and can be used in the <code>metric</code> parameter instead of the scorer's full name, e.g., <code>atom.run(\"LR\", metric=\"BA\")</code> uses <code>balanced_accuracy</code>. The available acronyms are:</p> <ul> <li>\"AP\" for \"average_precision\"</li> <li>\"BA\" for \"balanced_accuracy\"</li> <li>\"AUC\" for \"roc_auc\"</li> <li>\"LogLoss\" for \"neg_log_loss\"</li> <li>\"EV\" for \"explained_variance\"</li> <li>\"ME\" for \"max_error\"</li> <li>\"MAE\" for \"neg_mean_absolute_error\"</li> <li>\"MSE\" for \"neg_mean_squared_error\"</li> <li>\"RMSE\" for \"neg_root_mean_squared_error\"</li> <li>\"MSLE\" for \"neg_mean_squared_log_error\"</li> <li>\"MEDAE\" for \"neg_median_absolute_error\"</li> <li>\"MAPE\" for \"neg_mean_absolute_percentage_error\"</li> <li>\"POISSON\" for \"neg_mean_poisson_deviance\"</li> <li>\"GAMMA\" for \"neg_mean_gamma_deviance\"</li> </ul> <p>ATOM also provides some extra common metrics for binary classification tasks. </p> <ul> <li>\"TN\" for True Negatives</li> <li>\"FP\" for False Positives</li> <li>\"FN\" for False Negatives</li> <li>\"TP\" for True Positives</li> <li>\"FPR\" for False Positive rate (fall-out)</li> <li>\"TPR\" for True Positive Rate (sensitivity, recall)</li> <li>\"TNR\" for True Negative Rate (specificity)</li> <li>\"FNR\" for False Negative Rate (miss rate)</li> <li>\"MCC\" for Matthews Correlation Coefficient (also for multiclass classification)</li> </ul> <p></p>"}, {"location": "user_guide/training/#multi-metric-runs", "title": "Multi-metric runs", "text": "<p>Sometimes it is useful to measure the performance of the models in more than one way. ATOM lets you run the pipeline with multiple metrics at the same time. 
To do so, provide the <code>metric</code> parameter with a list of desired metrics, e.g., <code>atom.run(\"LDA\", metric=[\"r2\", \"mse\"])</code>.</p> <p>When fitting multi-metric runs, the resulting scores will return a list of metrics. For example, if you provided three metrics to the pipeline, <code>atom.knn.score_train</code> could return [0.8734, 0.6672, 0.9001]. Only the first metric of a multi-metric run (this metric is called the main metric) is used to select the winning model.</p> <p>Info</p> <ul> <li>The <code>winning</code> model is retrieved comparing only   the main metric.</li> <li>Some plots let you choose which of the metrics in a multi-metric run   to show using the <code>metric</code> parameter, e.g., plot_results.</li> </ul> <p></p>"}, {"location": "user_guide/training/#automated-feature-scaling", "title": "Automated feature scaling", "text": "<p>Models that require feature scaling will automatically do so before training, unless the data is sparse or already scaled. The data is considered scaled if it has one of the following prerequisites:</p> <ul> <li>The mean value over the mean of all columns lies between -0.05 and 0.05   and the mean of the standard deviation over all columns lies between 0.85   and 1.15. Categorical and binary columns (only 0s and 1s) are excluded   from the calculation.</li> <li>There is a transformer in the pipeline whose __name__ contains the   word <code>scaler</code>.</li> </ul> <p>The scaling is applied using a Scaler with default parameters. It can be accessed from the model through the <code>scaler</code> attribute. The scaled dataset can be examined through the model's data attributes. Use the available_models method to see which models require feature scaling. See here an example.</p> <p></p>"}, {"location": "user_guide/training/#in-training-validation", "title": "In-training validation", "text": "<p>Some predefined models allow in-training validation. 
This means that the estimator is evaluated (using only the main metric) on the train and test set after every round of the training (a round can be an iteration for linear models or an added tree for boosted tree models). The validation scores are stored in the <code>evals</code> attribute, a dictionary of the train and test performances per round (also when pruning isn't applied). Click here for an example using in-training validation.</p> <p>The predefined models that support in-training validation are:</p> <ul> <li>CatBoost</li> <li>LightGBM</li> <li>MultiLayerPerceptron</li> <li>PassiveAggressive</li> <li>Perceptron</li> <li>StochasticGradientDescent</li> <li>XGBoost</li> </ul> <p>To apply in-training validation to a custom model, use the <code>has_validation</code> parameter when creating the custom model.</p> <p>Warning</p> <ul> <li>In-training validation is not calculated during hyperparameter tuning.</li> <li>CatBoost selects the weights achieved by the best evaluation on the test set after training. This means that, by default, there is some minor data leakage in the test set. Use the <code>use_best_model=False</code> parameter to avoid this behavior or use a holdout set to evaluate the final estimator.</li> </ul> <p>Tip</p> <p>Use the plot_evals method to visualize the in-training validation on the train and test sets.</p> <p></p>"}, {"location": "user_guide/training/#parameter-customization", "title": "Parameter customization", "text": "<p>By default, every estimator uses the default parameters they get from their respective packages. To select different ones, use the <code>est_params</code> parameter of the run method. There are two ways to add custom parameters to the models: adding them directly to the dictionary as key-value pairs or through dictionaries.</p> <p>Adding the parameters directly to <code>est_params</code> (or using a dict with the key 'all') shares them across all models in the trainer. 
In this example, both the XGBoost and the LightGBM model use 200 boosted trees. Make sure all the models do have the specified parameters or an exception will be raised!</p> <pre><code>atom.run(models=[\"XGB\", \"LGB\"], est_params={\"n_estimators\": 200})\n</code></pre> <p>To specify parameters per model, use the model name as key and a dict of the parameters as value. In this example, the XGBoost model uses <code>n_estimators=200</code> and the MultiLayerPerceptron uses one hidden layer with 75 neurons.</p> <pre><code>atom.run(\n    models=[\"XGB\", \"MLP\"],\n    est_params={\n        \"XGB\": {\"n_estimators\": 200},\n        \"MLP\": {\"hidden_layer_sizes\": (75,)},\n    }\n)\n</code></pre> <p>Some estimators allow you to pass extra parameters to the fit method (besides X and y). This can be done adding <code>_fit</code> at the end of the parameter. For example, to change XGBoost's verbosity, we can run:</p> <pre><code>atom.run(models=\"XGB\", est_params={\"verbose_fit\": True})\n</code></pre> <p>Note</p> <p>If a parameter is specified through <code>est_params</code>, it's ignored by the study, even if it's added manually to <code>ht_params[\"distributions\"]</code>.</p> <p>Info</p> <p>The estimator's <code>n_jobs</code> and <code>random_state</code> parameters adopt atom's values (when available), unless specified through <code>est_params</code>.</p> <p></p>"}, {"location": "user_guide/training/#hyperparameter-tuning", "title": "Hyperparameter tuning", "text": "<p>In order to achieve maximum performance, it's important to tune an estimator's hyperparameters before training it. ATOM provides hyperparameter tuning through the optuna package. 
Just like optuna, we use the terms <code>study</code> and <code>trial</code> as follows:</p> <ul> <li>Study: optimization based on an objective function.</li> <li>Trial: a single execution of the objective function.</li> </ul> <p>Each trial is either computed by cross-validation on the complete training set or by randomly splitting the training set every iteration into a (sub)training and validation set. This process can create some minimum data leakage towards specific parameters (since the estimator is evaluated on data that is used to train the next estimator), but it ensures maximal use of the provided data. However, the leakage is not present in the independent test set, thus the final score of every model is unbiased. Note that, if the dataset is relatively small, the tuning's best score can consistently be lower than the final score on the test set due to the considerable lower fraction of instances on which it is trained. After finishing the study, the parameters that resulted in the best score are used to fit the final model on the complete training set.</p> <p>Info</p> <ul> <li>Unless specified differently by the user, the used samplers   are TPESampler   for single-metric runs and NSGAIISampler   for multi-metric runs.</li> <li>For multi-metric runs, the selected best trial   is the trial that performed best on the main metric. Use the property's   <code>@setter</code> to change it to any other trial. See the hyperparameter tuning   example.</li> </ul> <p>There are many possibilities to tune the study to your liking. 
The main parameter is <code>n_trials</code>, which determines the number of trials that are performed.</p> <p>Extra things to take into account:</p> <ul> <li>The train/validation splits are different per trial but equal for all models.</li> <li>Re-evaluating the objective function at the same point (with the same   hyperparameters) automatically skips the calculation and returns the   same score as the equivalent trial.</li> </ul> <p>Tip</p> <p>The hyperparameter tuning output can become quite wide for models with many hyperparameters. If you are working in a Jupyter Notebook, you can change the output's width running the following code in a cell: <pre><code>from IPython.display import display, HTML\ndisplay(HTML(\"&lt;style&gt;.container { width:100% !important; }&lt;/style&gt;\"))\n</code></pre></p> <p>Other settings can be changed through the <code>ht_params</code> parameter, a dictionary where every key-value combination can be used to further customize the optimization.</p> <p>By default, which hyperparameters are tuned and their corresponding distributions are predefined by ATOM. Use the 'distributions' key to customize these. Just like with <code>est_params</code>, it's possible to share the same parameters across models or use a dictionary with the model name as key to specify the parameters for every individual model. Use the key 'all' to tune some hyperparameters for all models when you also want to tune other parameters only for specific ones. The following example tunes the <code>n_estimators</code> parameter for both models but the <code>max_depth</code> parameter only for the RandomForest.</p> <pre><code>atom.run(\n    models=[\"ET\", \"RF\"],\n    n_trials=30,\n    ht_params={\"distributions\": {\"all\": \"n_estimators\", \"RF\": \"max_depth\"}},\n)\n</code></pre> <p>Like the <code>columns</code> parameter in atom's methods, you can exclude parameters from the optimization adding <code>!</code> before its name. 
It's possible to exclude multiple parameters, but not to combine inclusion and exclusion for the same model. For example, to optimize an ExtraTrees model using all its predefined parameters except <code>n_estimators</code>, run:</p> <pre><code>atom.run(\n    models=\"ET\",\n    n_trials=15,\n    ht_params={\"distributions\": \"!n_estimators\"},\n)\n</code></pre> <p>If just the parameter name is provided, the predefined distribution is used. It's also possible to provide custom distribution spaces, but make sure they are compliant with optuna's API. See every model's individual documentation in ATOM's API section for an overview of their hyperparameters and distributions.</p> <pre><code>from optuna.distributions import (\n    IntDistribution, FloatDistribution, CategoricalDistribution\n)\n\natom.run(\n    models=[\"ET\", \"RF\"],\n    n_trials=30,\n    ht_params={\n        \"distributions\": {\n            \"all\": {\"n_estimators\": IntDistribution(10, 100, step=10)},\n            \"RF\": {\n                \"max_depth\": IntDistribution(1, 10),\n                \"max_features\": CategoricalDistribution([\"sqrt\", \"log2\"]),\n           },\n        },\n    }\n)\n</code></pre> <p>Parameters for optuna's study and the study's optimize method can be added as kwargs to <code>ht_params</code>. 
For example, to use a different sampler or add a custom callback.</p> <pre><code>from optuna.samplers import RandomSampler\n\natom.run(\n    models=\"LR\",\n    n_trials=30,\n    ht_params={\n        \"sampler\": RandomSampler(seed=atom.random_state),\n        \"callbacks\": custom_callback(),\n    },\n)\n</code></pre> <p>Note</p> <ul> <li>If you use the default sampler, it\u2019s recommended to consider setting   larger <code>n_trials</code> to make full use of the characteristics of TPESampler   because TPESampler uses some (by default, 10) trials for its startup.</li> <li>When specifying distributions manually, make sure to import the   distribution types from optuna: <code>from optuna.distributions import ...</code>.</li> </ul> <p>Warning</p> <p>Keras' models can only use hyperparameter tuning when <code>n_jobs=1</code> or <code>ht_params={\"cv\": 1}</code>. Using n_jobs &gt; 1 and cv &gt; 1 raises a PicklingError due to incompatibilities of the APIs. Read here more about deep learning models.</p> <p>Tip</p> <p>ATOM has several plots that can help you examine a model's study and trials. Have a look at them here.</p> <p></p>"}, {"location": "user_guide/training/#pruning", "title": "Pruning", "text": "<p>During hyperparameter tuning, pruning stops unpromising trials at the early stages of the training (a.k.a., automated early-stopping). This can save the pipeline much time that would otherwise be wasted on an estimator that is unlikely to yield the best results. A pruned trial can't be selected as <code>best_trial</code>. Click here to see an example that uses pruning.</p> <p>The study uses MedianPruner as default pruner. 
You can use any other of optuna's pruners through the <code>ht_params</code> parameter.</p> <pre><code>from optuna.pruners import HyperbandPruner\n\natom.run(\"SGD\", n_trials=30, ht_params={\"pruner\": HyperbandPruner()})\n</code></pre> <p>Warning</p> <ul> <li>Pruning is disabled for multi-metric runs.</li> <li>Pruning is only available for models that support in-training validation.</li> </ul> <p></p>"}, {"location": "user_guide/training/#bootstrapping", "title": "Bootstrapping", "text": "<p>After fitting the estimator, you can assess the robustness of the model using the bootstrap technique. This technique creates several new data sets selecting random  samples from the training set (with replacement) and evaluates them on  the test set. This way you can get a distribution of the performance of the model. The sets are the same for every model. The number of sets can be chosen through the <code>n_bootstrap</code> parameter.</p> <p>Tip</p> <p>Use the plot_results method to plot the bootstrap scores in a boxplot.</p> <p></p>"}, {"location": "user_guide/training/#successive-halving", "title": "Successive halving", "text": "<p>Successive halving is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, we recommend only to use this technique with similar models, e.g., only using tree-based models.</p> <p>Run successive halving from atom via the successive_halving method. Consecutive runs of the same model are saved with the model's acronym followed by the number of models in the run. 
For example, a RandomForest in a run with 4 models would become model <code>RF4</code>.</p> <p>See here a successive halving example.</p> <p>Tip</p> <p>Use the plot_successive_halving method to see every model's performance per iteration of the successive halving.</p> <p></p>"}, {"location": "user_guide/training/#train-sizing", "title": "Train sizing", "text": "<p>When training models, there is usually a trade-off between model performance and computation time, that is regulated by the number of samples in the training set. Train sizing can be used to create insights in this trade-off, and help determine the optimal size of the training set. The models are fitted multiple times, ever-increasing the number of samples in the training set.</p> <p>Run train sizing from atom via the train_sizing method. The number of iterations and the number of samples per training can be specified with the <code>train_sizes</code> parameter. Consecutive runs of the same model are saved with the model's acronym followed by the fraction of rows in the training set (the <code>.</code> is removed from the fraction!). For example, a RandomForest in a run with 80% of the training samples would become model <code>RF08</code>.</p> <p>See here a train sizing example.</p> <p>Tip</p> <p>Use the plot_learning_curve method to see the model's performance per size of the training set.</p>"}]}
diff --git a/docs_sources/api/plots/plot_acf.md b/docs_sources/api/plots/plot_acf.md
new file mode 100644
index 000000000..01a4b44c8
--- /dev/null
+++ b/docs_sources/api/plots/plot_acf.md
@@ -0,0 +1,16 @@
+# plot_acf
+----------
+
+:: atom.plots:DataPlot.plot_acf
+    :: signature
+    :: head
+    :: table:
+        - parameters
+        - returns
+    :: see also
+
+<br>
+
+## Example
+
+:: examples
diff --git a/docs_sources/api/plots/plot_decomposition.md b/docs_sources/api/plots/plot_decomposition.md
new file mode 100644
index 000000000..dcc3a7dfa
--- /dev/null
+++ b/docs_sources/api/plots/plot_decomposition.md
@@ -0,0 +1,16 @@
+# plot_decomposition
+--------------------
+
+:: atom.plots:DataPlot.plot_decomposition
+    :: signature
+    :: head
+    :: table:
+        - parameters
+        - returns
+    :: see also
+
+<br>
+
+## Example
+
+:: examples
diff --git a/docs_sources/api/plots/plot_pacf.md b/docs_sources/api/plots/plot_pacf.md
new file mode 100644
index 000000000..b7edbe40a
--- /dev/null
+++ b/docs_sources/api/plots/plot_pacf.md
@@ -0,0 +1,16 @@
+# plot_pacf
+-----------
+
+:: atom.plots:DataPlot.plot_pacf
+    :: signature
+    :: head
+    :: table:
+        - parameters
+        - returns
+    :: see also
+
+<br>
+
+## Example
+
+:: examples
diff --git a/docs_sources/changelog/v5.x.x.md b/docs_sources/changelog/v5.x.x.md
index cfc1de2e4..a9674f0d4 100644
--- a/docs_sources/changelog/v5.x.x.md
+++ b/docs_sources/changelog/v5.x.x.md
@@ -1,65 +1,6 @@
 # Release history
 -----------------
 
-
-<a name="v600"></a>
-## Version 6.0.0
-
-**:star: New features**
-
-* Completely new module for time series. Read more in the [user guide][time-series].
-* Support for [Python 3.11](https://www.python.org/downloads/release/python-3110/) and drop support for [Python 3.8](https://www.python.org/downloads/release/python-380/)
-  and [Python 3.9](ttps://www.python.org/downloads/release/python-390/).
-* New data engines. Read more in the [user guide][data-acceleration].
-* Improved memory optimizations. Read more in the [user guide][memory-considerations].
-* Added the `iterative` strategy for [numerical imputation][imputer].
-* Added the `hdbscan` strategy to the [Pruner][] class.
-* Use the [`ignore`][atomclassifier-ignore] parameter to ignore columns in the dataset.
-* New [update_traces][atomclassifier-update_traces] method to further customize your plots.
-
-**:pencil: API changes**
-
-* The [plot_results][] method is divided into [plot_results][] and [plot_bootstrap][]
-  and accepts any metric.
-* The [FeatureGrouper][] class no longer accepts a `name` parameter. Provide
-  the group names directly through the `group` parameter as dict.
-* Rework of the [register][adaboost-register] method.
-* The `multioutput` attribute is deprecated. Multioutput meta-estimators are
-  now assigned automatically.
-* Model tags have to be separated from the acronym by an underscore.
-* The [`engine`][atomclassifier-engine] parameter is now a dict.
-* The `automl` method is deprecated.
-
-**:rocket: Enhancements**
-
-* Transformations only on `y` are now accepted, e.g., `atom.scale(columns=-1)`.
-* Full support for [pandas nullable dtypes](https://pandas.pydata.org/docs/user_guide/integer_na.html).
-* The dataset can now be provided as callable.
-* The [FeatureExtractor][] class can extract features from the dataset's index.
-* Subplots can now share axes on the [canvas][atomclassifier-canvas].
-* The [save][atomclassifier-save] and [save_data][atomclassifier-save_data]
-  methods now accept [pathlib.Path][] objects as `filename`.
-* Cleaner representation on hover for the [plot_timeline][] method.
-* The `cv` key in `ht_params` now accepts a custom cross-validation generator.
-* Improved error message for incorrect stratification of multioutput datasets.
-* Rework of the [shrink][atomclassifier-shrink] method.
-
-**:bug: Bug fixes**
-
-* Fixed a bug where the [cross_validate][adaboost-cross_validate] method could
-  fail for pipelines that changed the number of rows.
-* Fixed a bug where the [Pruner][] class didn't drop all outlier clusters.
-* Fixed a bug where the pipeline could fail for transformers that returned a
-  series.
-* Fixed a bug where the pipeline could fail for transformers that reset its
-  internal attributes during fitting.
-* Fixed a bug where the [register][adaboost-register] method failed in Databricks.
-* Fixed a bug where tuning hyperparameter for a `base_estimator` inside a custom
-  meta-estimator would fail.
-* Fixed a bug where the data properties' `@setter` could fail for numpy arrays.
-* Fixed a bug where reference lines for some plots didn't lie exactly on the unity line.
-
-
 <a name="v520"></a>
 ## Version 5.2.0
 
diff --git a/docs_sources/changelog/v6.x.x.md b/docs_sources/changelog/v6.x.x.md
new file mode 100644
index 000000000..a0b32c8a7
--- /dev/null
+++ b/docs_sources/changelog/v6.x.x.md
@@ -0,0 +1,59 @@
+# Release history
+-----------------
+
+<a name="v600"></a>
+## Version 6.0.0
+
+**:star: New features**
+
+* Completely new module for time series. Read more in the [user guide][time-series].
+* Support for [Python 3.11](https://www.python.org/downloads/release/python-3110/) and drop support for [Python 3.8](https://www.python.org/downloads/release/python-380/)
+  and [Python 3.9](https://www.python.org/downloads/release/python-390/).
+* New data engines. Read more in the [user guide][data-acceleration].
+* Improved memory optimizations. Read more in the [user guide][memory-considerations].
+* Added the `iterative` strategy for [numerical imputation][imputer].
+* Added the `hdbscan` strategy to the [Pruner][] class.
+* Use the [`ignore`][atomclassifier-ignore] parameter to ignore columns in the dataset.
+* New [update_traces][atomclassifier-update_traces] method to further customize your plots.
+
+**:pencil: API changes**
+
+* The [plot_results][] method is divided into [plot_results][] and [plot_bootstrap][]
+  and accepts any metric.
+* The [FeatureGrouper][] class no longer accepts a `name` parameter. Provide
+  the group names directly through the `group` parameter as dict.
+* Rework of the [register][adaboost-register] method.
+* The `multioutput` attribute is deprecated. Multioutput meta-estimators are
+  now assigned automatically.
+* Model tags have to be separated from the acronym by an underscore.
+* The [`engine`][atomclassifier-engine] parameter is now a dict.
+* The `automl` method is deprecated.
+
+**:rocket: Enhancements**
+
+* Transformations only on `y` are now accepted, e.g., `atom.scale(columns=-1)`.
+* Full support for [pandas nullable dtypes](https://pandas.pydata.org/docs/user_guide/integer_na.html).
+* The dataset can now be provided as callable.
+* The [FeatureExtractor][] class can extract features from the dataset's index.
+* Subplots can now share axes on the [canvas][atomclassifier-canvas].
+* The [save][atomclassifier-save] and [save_data][atomclassifier-save_data]
+  methods now accept [pathlib.Path][] objects as `filename`.
+* Cleaner representation on hover for the [plot_timeline][] method.
+* The `cv` key in `ht_params` now accepts a custom cross-validation generator.
+* Improved error message for incorrect stratification of multioutput datasets.
+* Rework of the [shrink][atomclassifier-shrink] method.
+
+**:bug: Bug fixes**
+
+* Fixed a bug where the [cross_validate][adaboost-cross_validate] method could
+  fail for pipelines that changed the number of rows.
+* Fixed a bug where the [Pruner][] class didn't drop all outlier clusters.
+* Fixed a bug where the pipeline could fail for transformers that returned a
+  series.
+* Fixed a bug where the pipeline could fail for transformers that reset its
+  internal attributes during fitting.
+* Fixed a bug where the [register][adaboost-register] method failed in Databricks.
+* Fixed a bug where tuning hyperparameters for a `base_estimator` inside a custom
+  meta-estimator would fail.
+* Fixed a bug where the data properties' `@setter` could fail for numpy arrays.
+* Fixed a bug where reference lines for some plots didn't lie exactly on the unity line.
diff --git a/docs_sources/dependencies.md b/docs_sources/dependencies.md
index 513dabbef..f9fcd089e 100644
--- a/docs_sources/dependencies.md
+++ b/docs_sources/dependencies.md
@@ -41,7 +41,7 @@ packages are necessary for its correct functioning.
 * **[numpy](https://numpy.org/)** (>=1.23.0)
 * **[optuna](https://optuna.org/)** (>=3.4.0)
 * **[pandas[parquet]](https://pandas.pydata.org/)** (>=2.1.2)
-* **[plotly](https://plotly.com/python/)** (>=5.15.0)
+* **[plotly](https://plotly.com/python/)** (>=5.18.0)
 * **[ray[serve]](https://docs.ray.io/en/latest/)** (>=2.7.1)
 * **[requests](https://requests.readthedocs.io/en/latest/)** (>=2.31.0)
 * **[scikit-learn](https://scikit-learn.org/stable/)** (>=1.4.0)
diff --git a/mkdocs.yml b/mkdocs.yml
index 2c484c99a..1df50d295 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -233,11 +233,13 @@ nav:
         - Pipeline:
             - Pipeline: API/pipeline/pipeline.md
         - Plots:
+            - plot_acf: API/plots/plot_acf.md
             - plot_bootstrap: API/plots/plot_bootstrap.md
             - plot_calibration: API/plots/plot_calibration.md
             - plot_components: API/plots/plot_components.md
             - plot_confusion_matrix: API/plots/plot_confusion_matrix.md
             - plot_correlation: API/plots/plot_correlation.md
+            - plot_decomposition: API/plots/plot_decomposition.md
             - plot_det: API/plots/plot_det.md
             - plot_distribution: API/plots/plot_distribution.md
             - plot_edf: API/plots/plot_edf.md
@@ -251,6 +253,7 @@ nav:
             - plot_learning_curve: API/plots/plot_learning_curve.md
             - plot_lift: API/plots/plot_lift.md
             - plot_ngrams: API/plots/plot_ngrams.md
+            - plot_pacf: API/plots/plot_pacf.md
             - plot_parallel_coordinate: API/plots/plot_parallel_coordinate.md
             - plot_pareto_front: API/plots/plot_pareto_front.md
             - plot_parshap: API/plots/plot_parshap.md
diff --git a/pyproject.toml b/pyproject.toml
index d79784634..f34a6e136 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ dependencies = [
     "numpy>=1.23.0",
     "optuna>=3.4.0",
     "pandas[parquet]>=2.1.2",
-    "plotly>=5.15.0",
+    "plotly>=5.18.0",
     "ray[serve]>=2.7.1",
     "requests>=2.31.0",
     "scikit-learn>=1.4.0",
diff --git a/tests/test_plots.py b/tests/test_plots.py
index 163e5b68e..1ce62ca70 100644
--- a/tests/test_plots.py
+++ b/tests/test_plots.py
@@ -275,6 +275,12 @@ def test_update_traces():
 
 # Test DataPlot ==================================================== >>
 
+def test_plot_acf():
+    """Assert that the plot_acf method works."""
+    atom = ATOMForecaster(y_fc, random_state=1)
+    atom.plot_acf(display=False)
+
+
 @pytest.mark.parametrize("show", [10, None])
 def test_plot_components(show):
     """Assert that the plot_components method works."""
@@ -311,6 +317,12 @@ def test_plot_ngrams(ngram):
     atom.plot_ngrams(ngram=ngram, display=False)  # When the corpus consists of tokens
 
 
+def test_plot_pacf():
+    """Assert that the plot_pacf method works."""
+    atom = ATOMForecaster(y_fc, random_state=1)
+    atom.plot_pacf(display=False)
+
+
 @pytest.mark.parametrize("X", [X10, X_sparse])
 def test_plot_pca(X):
     """Assert that the plot_pca method works."""